• [安装] mindelec gpu版本安装出错
    使用mindscience源码编译mindelec,缺省环境和conda环境(带pythonocc)安装mindspore 1.5和mindelec。mindspore没问题,但是mindelec引入时报以下错误:具体尝试过程参见:https://bbs.huaweicloud.com/forum/forum.php?mod=redirect&goto=findpost&ptid=167809&pid=1370660&fromuid=70062
  • [执行问题] ResNet101使用GPU进行训练时报错
    【报错内容】multiprocessing.context.TimeoutErrorRuntimeError: mindspore/ccsrc/backend/session/kernel_build_client.h:109 Response] Response is empty【操作步骤&问题现象】1、修改resnet101_imagenet2012_config.yaml中的训练集路径,更改类数量以适应新数据集2、在models/official/cv/resnet/下使用命令python train.py进行训练【截图信息】
  • [活动体验] "lite\src\runtime\gpu\opencl\opencl_allocator.cc" 注释4
    ""lite\src\runtime\gpu\opencl\opencl_allocator.cc""注释4 ```C++ //设置map缓冲区 void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) { auto svm_capabilities = ocl_runtime_->GetSVMCapabilities(); if (svm_capabilities) { if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) {// auto it = allocated_list_.find(host_ptr);//寻找相应的分配器列表 if (it == allocated_list_.end()) { MS_LOG(ERROR) "Map buffer failed, can not found buffer :" host_ptr; return nullptr; } int ret = ocl_runtime_->MapBuffer(host_ptr, flags, it->second->size_, static_cast(command_queue), sync);//创建mapbuffer if (ret != RET_OK) { MS_LOG(WARNING) "MapBuffer failed."; } } return host_ptr; } Lock(); auto it = allocated_list_.find(host_ptr);//寻找相应缓冲流 if (it == allocated_list_.end()) { UnLock(); MS_LOG(ERROR) "Map buffer failed, can not found buffer :" host_ptr; return nullptr; } if (it->second->map_flags_) {//判断Host ptr是否map UnLock(); MS_LOG(WARNING) "Host ptr " host_ptr " has mapped"; return host_ptr; } MemBuf *mem_buf = it->second; MS_ASSERT(mem_buf); if (mem_buf->mem_type_ == MemType::SHARED) {//判断该指针是否需要map UnLock(); MS_LOG(WARNING) "Host ptr " host_ptr " no need map"; return host_ptr; } void *new_host_ptr{nullptr};//初始化指针 if (mem_buf->mem_type_ == MemType::BUF) {//判断mem的type cl::Buffer *buffer = static_cast(mem_buf->device_ptr_);//转换类型 MS_ASSERT(buffer); new_host_ptr = ocl_runtime_->MapBuffer(*buffer, flags, mem_buf->size_, nullptr, sync); } else if (mem_buf->mem_type_ == MemType::IMG) { std::vector region{mem_buf->img_size_.width, mem_buf->img_size_.height, 1};//创建一个图片数据容器 cl::Image2D *image = static_cast(mem_buf->image_ptr_); MS_ASSERT(image); new_host_ptr = ocl_runtime_->MapBuffer(*image, sync, CL_MAP_READ | CL_MAP_WRITE, region); } if (new_host_ptr == nullptr) { UnLock(); MS_LOG(WARNING) "Map buffer failed, can not found buffer or already mapped, dev_ptr=" mem_buf->device_ptr_ ", host_ptr=" host_ptr; return nullptr; } mem_buf->map_flags_ = true; mem_buf->host_ptr_ = 
new_host_ptr; allocated_list_.erase(it); allocated_list_[new_host_ptr] = mem_buf; UnLock(); MS_LOG(DEBUG) "Map buffer form " host_ptr " to " new_host_ptr; return new_host_ptr; } //映射缓冲流 //此函数用于将一个缓冲区对象中的数据映射为客户端中的地址空间 int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) { auto svm_capabilities = ocl_runtime_->GetSVMCapabilities(); if (svm_capabilities) { if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) { return ocl_runtime_->UnmapBuffer(host_ptr); } return RET_OK; } auto it = allocated_list_.find(host_ptr); if (it == allocated_list_.end()) { MS_LOG(ERROR) "Map buffer failed, can not found buffer :" host_ptr; return RET_ERROR; } if (it->second->map_flags_) { it->second->map_flags_ = false; cl::Memory *mem = static_cast(it->second->mem_type_ == MemType::BUF ? it->second->device_ptr_ : it->second->image_ptr_); return ocl_runtime_->UnmapBuffer(*mem, it->second->host_ptr_, static_cast(command_queue)); } else { MS_LOG(WARNING) "Host ptr " host_ptr " do not mapped"; return RET_OK; } } //获取内存类型 MemType OpenCLAllocator::GetMemType(void *host_ptr) { MemType mem_type{MemType::BUF};//初始化内存对象 Lock(); auto it = allocated_list_.find(host_ptr);//查找本地指针 if (it == allocated_list_.end()) {//判断是否找到 UnLock(); MS_LOG(ERROR) "Can not found buffer :" host_ptr; return mem_type; } MemBuf *mem_buf = it->second;//寻找内存流 MS_ASSERT(mem_buf); mem_type = mem_buf->mem_type_;//输出内存类型 UnLock(); return mem_type;//返回类型 } //获取image的大小 int OpenCLAllocator::GetImageSize(void *host_ptr, ImageSize *img_size) { MS_ASSERT(img_size); Lock(); auto it = allocated_list_.find(host_ptr);//寻找该image缓冲流 if (it == allocated_list_.end()) { UnLock(); MS_LOG(ERROR) "Can not found buffer :" host_ptr; return RET_OK; } MemBuf *mem_buf = it->second; MS_ASSERT(mem_buf); if (mem_buf->mem_type_ == MemType::IMG) {//判断类型 *img_size = mem_buf->img_size_;//输出size } UnLock(); return RET_OK; } } // namespace mindspore::lite::opencl ```
  • [活动体验] "lite\src\runtime\gpu\opencl\opencl_allocator.cc" 注释3
    ""lite\src\runtime\gpu\opencl\opencl_allocator.cc""注释3 ```C++ //此函数用于释放空间 void OpenCLAllocator::Free(void *buf) { if (buf == nullptr) { return; } Lock(); auto iter = allocated_list_.find(buf);//获取分配器列表中的缓冲流 if (iter != allocated_list_.end()) { if (iter->second->map_flags_) { int ret = UnmapBuffer(buf);//进行映射缓冲区 if (ret != RET_OK) { MS_LOG(WARNING) "UnmapBuffer failed."; } iter->second->map_flags_ = false; } auto mem_buf = iter->second; allocated_list_.erase(iter);//释放缓冲流 free_list_.insert(std::make_pair(mem_buf->size_, mem_buf));//插入数据 UnLock(); MS_LOG(DEBUG) "Free device buffer. size: " mem_buf->size_ ", host addr: " mem_buf->host_ptr_ ", device addr: " mem_buf->device_ptr_ ", image addr: " mem_buf->image_ptr_ ", free list size: " free_list_.size(); return; } UnLock(); MS_LOG(WARNING) "Host ptr " buf " has freed";//输出警告窗口 } //计算分配器总的大小 size_t OpenCLAllocator::total_size() { Lock(); size_t totalSize = 0; for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {//遍历进行统计分配器的totalsize totalSize += it->second->size_; } for (auto it = free_list_.begin(); it != free_list_.end(); it++) {//遍历统计free的totalsize totalSize += it->second->size_; } UnLock(); return totalSize;//返回size } //获取图像 void *OpenCLAllocator::GetImage(void *buffer) { auto it = allocated_list_.find(buffer);//获取相应缓冲流 if (it != allocated_list_.end()) { return it->second->image_ptr_;//返回相应图像指针 } return nullptr; } //获取缓冲流 void *OpenCLAllocator::GetBuffer(void *buffer) { auto it = allocated_list_.find(buffer);//获取相应分配器列表 if (it != allocated_list_.end()) { return it->second->device_ptr_;//返回设备指针 } return nullptr; } template //设置一个T类型 //此函数用于清空列表 void OpenCLAllocator::ClearMemList(T *list) { auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();//获取支持向量机 for (auto it = list->begin(); it != list->end(); it++) {//遍历list进行设置映射缓冲区 if (it->second->map_flags_) { int ret = UnmapBuffer(it->second->host_ptr_); if (ret != RET_OK) { MS_LOG(WARNING) "UnmapBuffer failed."; } } if (svm_capabilities) { 
clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);//打开svm缓冲区 MS_LOG(DEBUG) "OpenCL free svm buffer : " it->second->host_ptr_; } else { cl::Buffer *buffer = static_cast(it->second->device_ptr_);//转换类型 MS_LOG(DEBUG) "OpenCL free device buffer : " buffer; if (buffer != nullptr) { delete buffer; it->second->device_ptr_ = nullptr; } cl::Image *image = static_cast(it->second->image_ptr_);//转换为image类型 if (image != nullptr) { delete image; it->second->image_ptr_ = nullptr; } if (it->second->mem_type_ == MemType::SHARED) {//判断类型是否正确 free(it->second->host_ptr_);//释放空间 it->second->host_ptr_ = nullptr; } } delete it->second; } list->clear(); } //清空函数 void OpenCLAllocator::Clear() { Lock(); ClearMemList>(&allocated_list_);//进行释放空间 ClearMemList>(&free_list_); UnLock(); } ```
  • [活动体验] "lite\src\runtime\gpu\opencl\opencl_allocator.cc" 注释2
    ""lite\src\runtime\gpu\opencl\opencl_allocator.cc""注释2 ```C++ //创建2D图像函数 void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, void *data, size_t flags, bool is_map, cl::Buffer **buffer, cl::Image2D **image) { cl_int ret = CL_SUCCESS; MS_ASSERT(buffer); MS_ASSERT(image); if (data == nullptr) {//当数据为无效值时 // copy from cl2.hpp cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, img_size.width, img_size.height, 0, 0, 0, 0, 0, 0, (**buffer).get()}; const cl::Context &context = *ocl_runtime_->Context();//获取复制环境 cl_image_format image_format{CL_RGBA, static_cast(img_size.dtype)}; *image = new (std::nothrow) cl::Image2D(clCreateImage(context.get(), 0, &image_format, &desc, nullptr, &ret)); } else { cl::ImageFormat image_format(CL_RGBA, img_size.dtype); *image = new (std::nothrow) cl::Image2D(*ocl_runtime_->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, img_size.width, img_size.height, 0, data, &ret);//创建一个2D图像空间 } if (*image == nullptr) { delete *buffer;//创建2D空间失败则删除缓冲区释放空间 MS_LOG(ERROR) "Create OpenCL Image2D failed! 
(ERROR CODE: " mindspore::kernel::CLErrorCode(ret) ")"; return nullptr; } if (ret != CL_SUCCESS) {//判断Cl缓冲区是否创建成功 delete *buffer; delete *image; MS_LOG(ERROR) "Create OpenCL Image2D (ERROR CODE: " mindspore::kernel::CLErrorCode(ret) ")"; return nullptr; } MS_LOG(DEBUG) "Malloc a new Image2D, width=" img_size.width ", height=" img_size.height;//分配2D空间大小提示 void *host_ptr = nullptr; if (is_map) { std::vector region{img_size.width, img_size.height, 1};//创建一个地址容器 host_ptr = ocl_runtime_->MapBuffer(**image, true, CL_MAP_READ | CL_MAP_WRITE, region);//创建一个图缓冲区 if (host_ptr == nullptr) { delete *buffer; delete *image; MS_LOG(ERROR) "Map image failed, can not found image :" *image ", host_ptr=" host_ptr; return nullptr; } cl::Memory *mem = *image; ret = ocl_runtime_->UnmapBuffer(*mem, host_ptr); if (ret != CL_SUCCESS) { MS_LOG(WARNING) "UnmapBuffer failed.";//取消映射缓冲区失败 } } return host_ptr; } //获取 Img 类型大小 int OpenCLAllocator::GetImgDtypeSize(const ImageSize &img_size) { size_t dtype_size = 0;//初始化size if (img_size.dtype == CL_FLOAT) { dtype_size = sizeof(cl_float); } else if (img_size.dtype == CL_HALF_FLOAT) { dtype_size = sizeof(cl_half); } else if (img_size.dtype == CL_SIGNED_INT8) {/进行存储size dtype_size = sizeof(cl_uchar); } else if (img_size.dtype == CL_SIGNED_INT32) { dtype_size = sizeof(cl_int); } else { MS_LOG(ERROR) "Unsupported dtype " img_size.dtype;//type不符合标准 return RET_ERROR; } uint32_t image_alignment = ocl_runtime_->GetImagePitchAlignment();//获取间距对齐的图像 size_t size = UP_ROUND(img_size.width, image_alignment) * img_size.height * C4NUM * dtype_size; return size; } //分配空间函数 void *OpenCLAllocator::_Malloc(MemType mem_type, void *data, size_t size, const ImageSize &img_size) { auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();//获取支持向量机功能 auto enable_arm_import_memory = ocl_runtime_->isExtensionEnable(EXT_ARM_IMPORT_MEMORY_HOST);//判断是否能够扩展启用 if (mem_type == MemType::SHARED && !enable_arm_import_memory) { mem_type = MemType::BUF; } if (mem_type == 
MemType::IMG) { size = GetImgDtypeSize(img_size); } if (size > ocl_runtime_->GetMaxAllocSize()) {//Malloc 数据超出 max_size,分配的空间量太大 MS_LOG(ERROR) "MallocData out of max_size, size: " size; return nullptr; } Lock(); void *host_ptr = MinimumFit(mem_type, size, img_size);//最小拟合函数 UNLOCK_AND_RETURN_NULL(host_ptr != nullptr && data == nullptr, host_ptr); total_size_ += size; const uint64_t max_size = ocl_runtime_->GetGlobalMemSize() * 0.8;//获取全局内存大小 UNLOCK_AND_RETURN_NULL(total_size_ >= max_size, nullptr); cl::Buffer *buffer = nullptr; cl::Image2D *image = nullptr; cl_mem_flags flags = CL_MEM_READ_WRITE; if (svm_capabilities) { flags |= (svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0; flags |= (svm_capabilities & CL_DEVICE_SVM_ATOMICS) ? CL_MEM_SVM_ATOMICS : 0; host_ptr = clSVMAlloc((*ocl_runtime_->Context())(), flags, size, 0); } else { if (mem_type == MemType::SHARED) { size = UP_ROUND(size, ocl_runtime_->GetCacheLineSize());//获取缓存栈大小 host_ptr = malloc(size); UNLOCK_AND_RETURN_NULL(host_ptr == nullptr, nullptr); buffer = ocl_runtime_->CreateSharedMemoryBuffer(size, host_ptr);//创建共享内存缓冲区 } else { flags |= (data == nullptr) ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_COPY_HOST_PTR; if (mem_type == MemType::BUF || data == nullptr) { host_ptr = CreateBuffer(size, data, flags, &buffer);//创建缓冲流 UNLOCK_AND_RETURN_NULL(host_ptr == nullptr, nullptr); } if (mem_type == MemType::IMG) { void *host_ptr_im = CreateImage2D(size, img_size, data, flags, data != nullptr, &buffer, ℑ);//创建2Dimage UNLOCK_AND_RETURN_NULL(data != nullptr && host_ptr_im == nullptr, nullptr); host_ptr = (data != nullptr) ? 
host_ptr_im : host_ptr; } } } MemBuf *mem_buf = new (std::nothrow) MemBuf;//新建一个membuf对象 if (mem_buf == nullptr) {//判断是否有效 delete buffer; delete image; if (mem_type == MemType::SHARED) {//无效则释放资源 free(host_ptr); } UnLock(); return nullptr; } mem_buf->size_ = size;//初始化数据 mem_buf->device_ptr_ = static_cast(buffer); mem_buf->host_ptr_ = host_ptr; mem_buf->image_ptr_ = static_cast(image); mem_buf->mem_type_ = mem_type; mem_buf->img_size_ = img_size; allocated_list_[host_ptr] = mem_buf; UnLock(); std::string type_name = (mem_type == MemType::BUF) ? "buffer" : "Image2D";//获取类型名称 type_name = (mem_type == MemType::SHARED) ? "shared" : type_name; MS_LOG(DEBUG) "Malloc a new " type_name ". size: " mem_buf->size_ ", host addr: " mem_buf->host_ptr_ ", device addr: " mem_buf->device_ptr_ ", image_addr: " image ", total size: " total_size_;//输出分配的空间信息 return host_ptr; } ```
  • [执行问题] mindspore使用GPU单卡训练时,如何指定训练卡id
  • [活动体验] "mindspore\lite\src\lite_session.cc" 注释7
    ** mindspore\lite\src\lite_session.cc"注释7** ======================================= ```python //重置大小函数 int LiteSession::Resize(const std::vector &inputs, const std::vector> &dims) { bool expected = false;//初始化期望值 if (!is_running_.compare_exchange_strong(expected, true)) {//判断是否为多线程运行 MS_LOG(ERROR) "Not support multi-threading";//不支持多线程 return RET_ERROR; } std::vector> old_dims;//创建一个int类型的容器 for (size_t i = 0; i inputs_.size(); ++i) {//遍历添加所有的shape old_dims.push_back(inputs_[i]->shape()); } auto ret = ResizeInputs(inputs, dims);//重置输出值 if (ret != RET_OK) { ResetInputsShape(old_dims);//重置输出shape is_running_.store(false);//重置运行状态,释放资源 return ret; } ret = ReSizeKernels(kernels_);//重置内核 if (ret != RET_OK) {//判断是否重置成功 ResetInputsShape(old_dims);//重置输入shape auto resize_ret = ReSizeKernels(kernels_);//重置内核函数 if (resize_ret != RET_OK) {//判断是否重置成功 MS_LOG(ERROR) "restore kernel size fail!ret: " resize_ret; } is_running_.store(false);//释放资源 return ret; } is_running_.store(false); return RET_OK; } //初始化GPU运行时间 int LiteSession::InitGPURuntime() { #if GPU_OPENCL//当GPU处于开的时候 if (this->context_->IsGpuEnabled()) { opencl_runtime_wrapper_ = new (std::nothrow) opencl::OpenCLRuntimeWrapper();//创建开关运行状态包装器 if (opencl_runtime_wrapper_ == nullptr) { MS_LOG(ERROR) "create OpenCLRuntimeWrapper failed";//创建失败 return RET_ERROR; } auto gpu_device_info = this->context_->GetGpuInfo();//获取GPU信息 auto opencl_runtime = opencl_runtime_wrapper_->GetInstance();//获取实例 opencl_runtime->SetFp16Enable(gpu_device_info.enable_float16_);//设置FP16使用 if (opencl_runtime->Init() != RET_OK) {//判断初始化是否成功 this->context_->device_list_ = {{DT_CPU, {gpu_device_info.enable_float16_, MID_CPU}}}; MS_LOG(WARNING) "Init OpenCL runtime failed, change to CPU mode.";//初始化 OpenCL 运行时失败,更改为 CPU 模式 } else { MS_LOG(INFO) "Init OpenCL runtime success.";//初始化 } } #elif GPU_VULKAN if (this->context_->IsGpuEnabled()) {//是否启用 Gpu auto gpu_device_info = this->context_->GetGpuInfo();//获取GPU信息 vk_runtime_wrap_ = new (std::nothrow) 
gpu::GpuRuntimeWrapper;//创建包装器 if (vk_runtime_wrap_ == nullptr) {//判断是否创建成功 MS_LOG(ERROR) "create vk_runtime failed"; return RET_ERROR; } auto vk_runtime = vk_runtime_wrap_->GetInstance();//获取实例 vk_runtime->SetFp16Enable(gpu_device_info.enable_float16_);/设置FP16使用 if (vk_runtime->Init() != RET_OK) { this->context_->device_list_ = {{DT_CPU, {gpu_device_info.enable_float16_, MID_CPU}}}; MS_LOG(WARNING) "Init Vulkan runtime failed, change to CPU mode.";//Init Vulkan 运行时失败,更改为 CPU 模式 } else { MS_LOG(INFO) "Init Vulkan runtime success."; } } #endif return RET_OK; } } // namespace lite命名空间 //创建空间 session::LiteSession *session::LiteSession::CreateSession(const lite::Context *context) { auto session = new (std::nothrow) lite::LiteSession();//创建一个新的Session if (session == nullptr) {//创建 MS_LOG(ERROR) "create session failed"; return nullptr; } auto ret = session->Init(context);//初始化环境 if (ret != mindspore::lite::RET_OK) {//判断初始化是否成功 MS_LOG(ERROR) "init session failed"; delete session; return nullptr; } return session; } //创建对话空间 session::LiteSession *session::LiteSession::CreateSession(const char *model_buf, size_t size, const lite::Context *context) { auto *session = LiteSession::CreateSession(context);//创建Session if (session == nullptr) {//判断是否创建成功 MS_LOG(ERROR) "Create session failed"; return nullptr; } auto *model = lite::ImportFromBuffer(model_buf, size, true);//导入BUffer if (model == nullptr) {//判断是否导入成功 MS_LOG(ERROR) "Import model failed"; return nullptr; } auto ret = session->CompileGraph(model);//编译图 if (ret != lite::RET_OK) { MS_LOG(ERROR) "Compile model failed"; return nullptr; } model->buf = nullptr; (reinterpret_cast(session))->set_model(model);//转换类型 return session;//返回Session } } // namespace mindspore ```
  • [调优经验] 【MindSpore】DCPUE网络性能优化 -- GPU训练篇
    ### 问题描述 1. 训练条件: Linux Euler OS x86; 8 显卡; 物理 CPU 2; 每个物理 CPU 中的核数 26; 逻辑 CPU 104; MindSpore 1.2.0 TensorFlow 1.15.0 ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095802wsdrczahuywoafob.png) 2. GPU 训练时长 MindSpore: 1:58 TensorFlow: 1:04 3. 优化目的 在相同条件下,使用 MindSpore 框架训练网络的时长 **小于或等于** 使用 TensorFlow 框架训练的时长 ### 问题分析 1. 该网络为了防止过拟合,会基于同一模型同时训练多个神经网络。通过阅读代码,发现无论是 CPU 训练还是 GPU 训练,在训练多个神经网络时,会给每一个神经网络绑定一个逻辑 CPU 进行训练。 2. 监控 GPU 训练过程,发现训练时,被绑定的逻辑 CPU 资源占用率一直为100%,据此推测是网络训练过程中的一些与 CPU 有关的操作耗资源太大,导致训练总时长增加。 ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095815yvyhccsw76dh27u8.png) 3. 关闭代码中,每个网络绑定一个逻辑 CPU 进行训练的逻辑。再监控 GPU 训练过程,发现训练时,逻辑 CPU 资源占用率为250%左右。 ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095826er4oz1hwpnz6kan1.png) 4. 对比 TensorFlow,保留/注释每个网络绑定一个逻辑 CPU 进行训练的部分代码,监控 GPU 训练过程,发现绑定一个逻辑 CPU 时,被绑定的逻辑 CPU 资源占用率为100%;而当不绑定逻辑 CPU 时,逻辑 CPU 资源占用率为150%左右。 ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095840ri91icbltgvsqv07.png) -- ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095851hhjhe6jsdge0rdt6.png) 5. 对比 TensorFlow,MindSpore 框架训练该网络所需要的 CPU 资源更多。据此可以断定,使用 MindSpore 框架训练该网络的性能瓶颈在 CPU 相关的操作部分。 6. 由于在 GPU 上训练,CPU 相关的操作只涉及样本数据的处理、数据处理完后拷贝到 GPU 这些。咨询 MindSpore 框架负责数据处理部分的专家,了解到原因为: 1. 涉及 CPU 的主要操作为,在每一个 step,一个 batch 的数据输入一层网络,计算完成后,都需要做一次类型转换,才能够输入到下一层网络。所以这里存在 把数据从 GPU 拷贝至 CPU -> 数据类型转换 -> 把数据从 CPU 拷贝回 GPU 这三步(不一定准确,具体需要再看代码实现)。 2. 数据处理(数据同步和拷贝),用了 MindDataset 和 Batch 两个模块。在不额外设置线程数的情况下,默认会给每个模块分别分配2个线程(主线程做简单的同步,子线程并行地进行数据拷贝)。另外再加上一个 ctrl+c 线程,总共会分配5个线程。 3. 根据一个逻辑 CPU 处理两个线程的基本原则,不绑定逻辑 CPU 的情况下,5个线程就会占用约2.5个逻辑 CPU(这与之前的测试数据相吻合)。而在绑定逻辑 CPU 的情况下,单个逻辑 CPU 启用5个线程处理数据,会导致线程竞争,这可能是造成训练耗时比较长的主要原因。 ### 优化方案 1. 
弃用 MindDataset 和 Batch 两个模块,改用 GeneratorDataset 模块(2个线程)对数据进行预处理以及负责数据的拷贝,这样从 线程数:2(MindDataset) + 2(Batch) + 1(ctrl + c) = 5 CPU数:5 / 2 = 2.5 降为 线程数:2(GeneratorDataset) + 1(ctrl + c)= 3 CPU数:3 / 2 = 1.5 2. 减少了所需要的 CPU 数,减少了线程数以降低线程竞争现象。最终达到的效果为,进行 GPU 训练时,GPU 训练性能基本与 TensorFlow 持平: MindSpore: 0:57 TensorFlow: 1:04 ### 反思 1. 同样是数据同步和拷贝,GeneratorDataset 模块只需要2个线程, 而MindDataset 和 Batch 两个模块需要4个线程。也就是说,2个线程可以做完的事情,分配了4个线程去做。这是不是 CPU 资源分配的不合理呢? 在本次网络训练中,数据样本量较小(7k行,3MB)。但尽管如此,在数据处理过程中,由于 Batch Size = N 是预先设置好的(超参数),每一个 step 中固定读取 N 条样本数据。实践证明,训练该网络,每一个 step 处理的这 N 条样本数据,3个线程也就是1.5个左右的逻辑 CPU 资源就够了。MindDataset 和 Batch 两个模块默认分配至少4个线程处理数据的设计,是值得讨论的。至少在这个网络对应的应用场景下,这两个模块的设计是可以优化的。 2. 其实,单看数据处理(数据同步和拷贝)这部分,GeneratorDataset 模块也需要2个线程,加上 ctrl + c 的1个线程,总共占用了3个线程也就是1.5个 CPU 资源,这是否合理呢? 3. 在不绑定逻辑 CPU 的情况下验证一下。发现使用MindDataset 和 Batch 两个模块时,虽然逻辑 CPU 占用率为250%左右,但是每个 CPU 的用户空间使用率并不高。进一步证实了 CPU 资源分配的不合理 ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095911lcwqchmhxgwshfxf.png) ![image.png](https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/01/095918nmte6rhm1txjt0du.png) 如果是在 GPU 上进行训练,那么网络的计算在 GPU 上,数据处理在 CPU 上,没有问题。但是如果在 CPU 上进行训练,那么网络计算所需要的资源也需要 CPU 提供。光是数据处理就占了1个逻辑 CPU,整个网络的训练性能可以达到什么程度,很难想象。
  • [分布式并行] MindSpore分布式并行训练—GPU平台通信方法
    ### 2.2 MindSpore的GPU平台通信 在GPU硬件平台上,MindSpore分布式并行训练的通信使用的是NCCL;采用的多进程通信库是OpenMPI。NCCL是Nvidia Collective multi-GPU Communication Library的简称,是英伟达提供的多GPU集合通信方案,在实现上参考了MPI接口,同时进行了诸多针对性优化。它是一个实现多GPU的collective communication通信(all-gather, reduce, broadcast)库,Nvidia做了很多优化,以在PCIe、Nvlink、InfiniBand上实现较高的通信速度。Open MPI 项目是一个开源消息传递接口实现,由学术、研究和行业合作伙伴组成的联盟开发和维护。 因此,Open MPI 能够结合来自高性能计算社区的所有专业知识、技术和资源,以构建可用的最佳 MPI 库。 Open MPI 为系统和软件供应商、应用程序开发人员和计算机科学研究人员提供了优势。MPI的相关配置内容通过文件`mindspore.parallel.mpi._mpi_config.py` 进行环境配置。 ```python # Copyright 2020 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ """ MPI配置,用于配置MPI环境。 """ import threading from mindspore._c_expression import MpiConfig from mindspore._checkparam import args_type_check class _MpiConfig: """ _MpiConfig 是控制 MPI 的配置工具 Note: 不建议通过实例化 MpiConfig 对象来创建配置。 应该使用 MpiConfig() 来获取配置,因为 MpiConfig 是单例的。 """ _instance = None _instance_lock = threading.Lock() def __init__(self): self._mpiconfig_handle = MpiConfig.get_instance() def __new__(cls, *args, **kwargs): if cls._instance is None: cls._instance_lock.acquire() cls._instance = object.__new__(cls) cls._instance_lock.release() return cls._instance def __getattribute__(self, attr): value = object.__getattribute__(self, attr) if attr == "_mpiconfig_handle" and value is None: raise ValueError("mpiconfig handle is none in MpiConfig!!!") return value @property def enable_mpi(self): return self._mpiconfig_handle.get_enable_mpi() @enable_mpi.setter def enable_mpi(self, enable_mpi): self._mpiconfig_handle.set_enable_mpi(enable_mpi) _k_mpi_config = None def _mpi_config(): """ 获取全局的mpi config,如果没有创建mpi config,则新建一个。 Returns: _MpiConfig,全局 mpi 配置。 """ global _k_mpi_config if _k_mpi_config is None: _k_mpi_config = _MpiConfig() return _k_mpi_config @args_type_check(enable_mpi=bool) def _set_mpi_config(**kwargs): """ 为运行环境设置 mpi 配置。 应该在运行程序之前配置mpi config。如果没有配置,默认情况下,mpi 模块将被禁用。 Note: 设置属性时需要属性名称。 Args: enable_mpi (bool): 是否开启mpi。 默认值:False。 Raises: ValueError: 如果输入键不是 mpi 配置中的属性。 Examples: >>> mpiconfig.set_mpi_config(enable_mpi=True) """ for key, value in kwargs.items(): if not hasattr(_mpi_config(), key): raise ValueError("Set mpi config keyword %s is not recognized!" % key) setattr(_mpi_config(), key, value) def _get_mpi_config(attr_key): """ 根据输入键获取mpi config属性值。 Args: attr_key (str): 属性的键。 Returns: Object, 给定属性键的值。 Raises: ValueError: 如果输入键不是config中的属性。 """ if not hasattr(_mpi_config(), attr_key): raise ValueError("Get context keyword %s is not recognized!" 
% attr_key) return getattr(_mpi_config(), attr_key) ```
  • [安装经验] ubuntu18.04server系统(cuda11.1)环境下进行mindspore_gpu_1.5版本源码编译
    原文:https://www.cnblogs.com/devilmaycry812839668/p/15470501.html
  • [交流分享] 【模型转换 01】YoLo V3 V4模型Darknet2Caffe
    参考指导:https://pjreddie.com/darknet/yolo/参考代码:https://github.com/pjreddie/darknet一、原生darknet 跑通1.  git clone https://github.com/pjreddie/darknet.git2.  cd darknet; make;3.  nohup wget -c https://pjreddie.com/media/files/yolov3.weights &4.  ./darknet detect cfg/yolov3.cfg ./yolov3.weights data/dog.jpg 使用GPU参考链接: https://pjreddie.com/darknet/install/#cuda5.  修改makefile,编辑 Makefile中 GPU=1 6.  Make7.  可以使用参数 -i 指定使用的GPU./darknet -i 0 detect cfg/yolov3.cfg ../yolov3.weights ./data/person.jpg8.  如果编译使能了GPU,但测试时不想使用gpu,则可以使用 -nogpu 参数来使用CPU./darknet -nogpu detect cfg/yolov3.cfg ../yolov3.weights ./data/person.jpg9.  单张图片推理对比(608*608)二、Darknet2Caffe1、准备工作(1)docker pull bvlc/caffe:cpu(2)docker run -it --net=host --name=bvlc_caffe bvlc/caffe:cpu /bin/bash(3)export http_proxy=(4)apt-get update; apt-get install -y curl vim inetutils-ping(5)pip install --upgrade pip; pip install torch future(6)git config --global http.sslVerify false(7)git clone https://github.com/ChenYingpeng/darknet2caffe.git2、YoloV3转换caffemodel(1)把yolov3.cfg/weights 放到容器内docker cp yolov3.cfg bvlc_caffe:/workspace/darknet2caffe/medocker cp yolov3.weights bvlc_caffe:/workspace/darknet2caffe/me(2)增加upsample层,并编译caffea.    cp caffe_layers/upsample_layer/upsample_layer.hpp /opt/caffe/include/caffe/layers/b.    cp caffe_layers/upsample_layer/upsample_layer.cpp /opt/caffe/src/caffe/layers/c.    vim /opt/caffe/src/caffe/proto/caffe.proto  中增加upsample参数d.    /opt/caffe/build; rm -rf ./*; cmake ../; make -j 8e.    等待完成编译f.     cd /workspace/darknet2caffe(3)修改 darknet2caffe.py 中 caffe_root 的 路径,让其指向容器内caffe的路径,如下:(4)python darknet2caffe.py me/yolov3.cfg me/yolov3.weights yolov3_me.prototxt yolov3_me.caffemodel(5)遇到问题a.    Message type "caffe.LayerParameter" has no field named "upsample_param"        i.    
解决办法,需要按增加upsample层并重训编译caffe3、YoloV4转换caffemodel(1)下载weights、cfg文件nohup wget -c --no-check-certificate https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights &wget -c --no-check-certificate  https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg(2)增加mish层a.    cp caffe_layers/mish_layer/mish_layer.hpp /opt/caffe/include/caffe/layers/b.    cp caffe_layers/mish_layer/mish_layer.cpp /opt/caffe/src/caffe/layers/c.    vim /opt/caffe/src/caffe/proto/caffe.proto  中增加mish参数d.    重新编译caffecd /opt/caffe/build; rm -rf ./*; cmake ../; make -j 8(3) 转换caffemodelpython darknet2caffe.py weights/yolov4.cfg weights/yolov4.weights yolov4_me.prototxt yolov4_me.caffemode(4) 问题a.    Unknown layer type: Mish (known types: AbsVal,        i.    解决办法              mish层没有增加并remake三、Caffemodel 转换为om1、YoloV3转换为om修改yolov3.prototxt,对比如下:在yolov3_acl.prototxt最后增加yolov3DetecionOutput层,如下:layer { bottom: "layer82-conv" top: "yolo1_coords" top: "yolo1_obj" top: "yolo1_classes" name: "yolo1" type: "Yolo" yolo_param { boxes: 3 coords: 4 classes: 80 yolo_version: "V3" softmax: true background: false } } layer { bottom: "layer94-conv" top: "yolo2_coords" top: "yolo2_obj" top: "yolo2_classes" name: "yolo2" type: "Yolo" yolo_param { boxes: 3 coords: 4 classes: 80 yolo_version: "V3" softmax: true background: false } } layer { bottom: "layer106-conv" top: "yolo3_coords" top: "yolo3_obj" top: "yolo3_classes" name: "yolo3" type: "Yolo" yolo_param { boxes: 3 coords: 4 classes: 80 yolo_version: "V3" softmax: true background: false } } layer { name: "detection_out3" type: "YoloV3DetectionOutput" bottom: "yolo1_coords" bottom: "yolo2_coords" bottom: "yolo3_coords" bottom: "yolo1_obj" bottom: "yolo2_obj" bottom: "yolo3_obj" bottom: "yolo1_classes" bottom: "yolo2_classes" bottom: "yolo3_classes" bottom: "img_info" top: "box_out" top: "box_out_num" yolov3_detection_output_param { boxes: 3 classes: 80 relative: true obj_threshold: 0.5 score_threshold: 0.5 
iou_threshold: 0.45 pre_nms_topn: 512 post_nms_topn: 1024 biases_high: 10 biases_high: 13 biases_high: 16 biases_high: 30 biases_high: 33 biases_high: 23 biases_mid: 30 biases_mid: 61 biases_mid: 62 biases_mid: 45 biases_mid: 59 biases_mid: 119 biases_low: 116 biases_low: 90 biases_low: 156 biases_low: 198 biases_low: 373 biases_low: 326 } }转换命令:atc --model=./yolov3_acl.prototxt --weight=./yolov3.caffemodel --framework=0 --output=./yolov3_aipp_416_416 --soc_version=Ascend310 --insert_op_conf=./aipp_yolov3.cfg2、YoloV4转换为om在yolov4_acl.prototxt最后增加yolov3DetecionOutput层,如下:layer { bottom: "layer139-conv" top: "yolo1_coords" top: "yolo1_obj" top: "yolo1_classes" name: "yolo1" type: "Yolo" yolo_param { boxes: 3 coords: 4 classes: 80 yolo_version: "V3" softmax: true background: false } } layer { bottom: "layer150-conv" top: "yolo2_coords" top: "yolo2_obj" top: "yolo2_classes" name: "yolo2" type: "Yolo" yolo_param { boxes: 3 coords: 4 classes: 80 yolo_version: "V3" softmax: true background: false } } layer { bottom: "layer161-conv" top: "yolo3_coords" top: "yolo3_obj" top: "yolo3_classes" name: "yolo3" type: "Yolo" yolo_param { boxes: 3 coords: 4 classes: 80 yolo_version: "V3" softmax: true background: false } } layer { name: "detection_out3" type: "YoloV3DetectionOutput" bottom: "yolo1_coords" bottom: "yolo2_coords" bottom: "yolo3_coords" bottom: "yolo1_obj" bottom: "yolo2_obj" bottom: "yolo3_obj" bottom: "yolo1_classes" bottom: "yolo2_classes" bottom: "yolo3_classes" bottom: "img_info" top: "box_out" top: "box_out_num" yolov3_detection_output_param { boxes: 3 classes: 80 relative: true obj_threshold: 0.5 score_threshold: 0.5 iou_threshold: 0.45 pre_nms_topn: 512 post_nms_topn: 1024 biases_high: 142 biases_high: 110 biases_high: 192 biases_high: 243 biases_high: 459 biases_high: 401 biases_mid: 36 biases_mid: 75 biases_mid: 76 biases_mid: 55 biases_mid: 72 biases_mid: 146 biases_low: 12 biases_low: 16 biases_low: 19 biases_low: 36 biases_low: 40 biases_low: 28 } }转换命令:atc 
--model=./yolov4_acl.prototxt --weight=./yolov4.caffemodel --framework=0 --output=./yolov4_aipp_608_608 --soc_version=Ascend310 --insert_op_conf=./aipp_yolov4_608_608.cfg四、A310上跑通YoloV3、YoloV41、代码下载https://gitee.com/ascend/ascend-referenceapps/tree/master/ApiSamples/Samples/InferObjectDetectioncd src/Samples/InferObjectDetection2、编译代码./build.sh;cd dist;3、修改配置文件修改 data/config/setup.config中model_path路径,以及模型的宽高如 YoloV3 416*416,如下:YoloV4 608*608 如下:4、运行YoloV3结果如下:YoloV4结果如下:
  • [分布式] mindspore在gpu上进行多机分布式训练
    【功能模块】mindspore在gpu上进行多机分布式训练【操作步骤&问题现象】1、如何指定每个节点的 显卡编号?例如pytorch是通过这种方式,os.environ['CUDA_VISIBLE_DEVICES'] = '0,1',那mindspore 呢2、mpirun进行多机分布式时,mindspore是如何进行多机并行的,能否详细描述一下原理?具体来说,能否说明一下多个节点之间如何进行通信,数据和可执行代码需要放在多个节点的共享文件系统中吗,还是只需要放在主节点,再通过网络传输给其他节点?3、mindspore官方文档中关于gpu上的多机分布式训练的教程不够细致和完善,用户很难根据这个教程完成多机分布式训练【截图信息】如下图所示,我在进行两个不同服务器(分别为118和52服务器)上的多机gpu分布式,用的是mindspore官方提供的yolov4模型,代码和数据集均放在118服务器上,两个服务器之间已经做好ssh免密通信,但是两个服务器之间没有开启共享文件系统,然后在118服务器上运行 run_train_gpu.sh后,出现下面的错误,提示在52服务器上找不到可执行文件(这个原因是啥),我比较好奇是否一定要建立共享文件系统,可执行文件和数据不能通过网络传输到其他节点吗?麻烦大佬看一下,非常感谢!【日志信息】(可选,上传日志内容或者附件)
  • [执行问题] WSL2下调用mindspore使用GPU分类CIFAR-10数据集时显存溢出
    【功能模块】GPU为NVIDIA GeForce RTX 2060 with Max-Q Design,内存6G系统为windows11【操作步骤&问题现象】1、安装wsl2,anaconda3,在python3.7.5环境下运行mindspore1.5.0-rc12、对CIFAR-10数据集尝试使用ResNet-50实现图像分类3、可调用CPU进行分类,但调用GPU时提示cudaHostAlloc failed, ret[2], out of memory4、可以调用GPU完成对MNIST数据集实现图像分类【截图信息】【日志信息】(可选,上传日志内容或者附件)
  • [算子编译] 【Mindspore】【模型训练】Modelzoo SEResNext50_32*4d GPU初始化错误
    【功能模块】MindSpore  版本:1.5.0-rc1ubuntu18.04python3.7.5GPU CUDA10.1【操作步骤&问题现象】1、修改batchsize为32和数据及路径后直接运行报错Attr output_num 32must less than28  ,修改group为16后报错Attr output_num 16must less than14,修改group为7才能够正常运行2、上传至modelarts上与自己电脑上运行错误相同,同样group更改为7才能使用                      配置为GPU: 1*NVIDIA-V100(32GB) | CPU: 8 核 64GB[ERROR] KERNEL(3516,7f24a92a2740,python):2021-10-23-20:03:05.062.308 [mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h:144] CheckParam] Attr output_num 32must less than28[EXCEPTION] DEVICE(3516,7f24a92a2740,python):2021-10-23-20:03:05.062.651 [mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc:63] CreateGPUKernel] Initialize gpu kernel op[Default/network-TrainOneStepCell/network-WithLossCell/_backbone-SENet/layer2-SequentialCell/1-SEResNeXtBottleneck/conv2-GroupConv/Split-op137405] failed.Traceback (most recent call last):  File "/home/zxm/PycharmProjects/pythonProject3/train.py", line 288, in    model.train(cfg.epoch_size, dataset, callbacks=cbs)  File "/home/zxm/.local/lib/python3.7/site-packages/mindspore/train/model.py", line 718, in train    sink_size=sink_size)  File "/home/zxm/.local/lib/python3.7/site-packages/mindspore/train/model.py", line 502, in _train    self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size)  File "/home/zxm/.local/lib/python3.7/site-packages/mindspore/train/model.py", line 564, in _train_dataset_sink_process    outputs = self._train_network(*inputs)  File "/home/zxm/.local/lib/python3.7/site-packages/mindspore/nn/cell.py", line 404, in __call__    out = self.compile_and_run(*inputs)  File "/home/zxm/.local/lib/python3.7/site-packages/mindspore/nn/cell.py", line 682, in compile_and_run    self.compile(*inputs)  File "/home/zxm/.local/lib/python3.7/site-packages/mindspore/nn/cell.py", line 669, in compile    _cell_graph_executor.compile(self, *inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode)  File 
"/home/zxm/.local/lib/python3.7/site-packages/mindspore/common/api.py", line 542, in compile    result = self._graph_executor.compile(obj, args_list, phase, use_vm, self.queue_name)RuntimeError: mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc:63 CreateGPUKernel] Initialize gpu kernel op[Default/network-TrainOneStepCell/network-WithLossCell/_backbone-SENet/layer2-SequentialCell/1-SEResNeXtBottleneck/conv2-GroupConv/Split-op137405] failed.【截图信息】【日志信息】(可选,上传日志内容或者附件)
  • [执行问题] modelart上gpu版本mindspore cuda出错
    【功能模块】【操作步骤&问题现象】1、在model art上执行代码,前几个batch都能正常运行,在运行到50个batch左右时,报"an illegal memory access was encountered"的错误【截图信息】【日志信息】(可选,上传日志内容或者附件)