Содержание
- 4. Физическое представление GPU
- 5. Логическое представление GPU (архитектура CUDA)
- 8. #include <stdio.h> #include <stdlib.h> float serial(float* f, long N); __global__ void summator(float* f,float* s, long N); float parallel(float*
- 9. N=atoi(argv[1]); num_of_blocks=atoi(argv[2]); threads_per_block=atoi(argv[3]); fun=(float*)malloc(N*sizeof(float)); for(i=0;i<N;i++) fun[i]=((i+0.5F)*(1.0/N))*((i+0.5F)*(1.0/N)); printf("Serial calculation is over! Result=%g\n", serial(fun,N)); printf("Parallel calculation is over!
- 10. float serial(float* f, long N){ int i; double s=0.0; for(i=0;i<N;i++) s+=f[i]; return s/(float)N; }
- 11. float parallel(float* f,long N, int num_of_blocks, int threads_per_block){ float* f_dev; float* s_dev; float* s_host; float s=0.0;
- 12. s_host=(float*)malloc( num_of_blocks*threads_per_block*sizeof(float)); cudaMalloc((void **) &s_dev, num_of_blocks*threads_per_block*sizeof(float)); for(i=0;i<num_of_blocks*threads_per_block;i++) s_host[i]=0.0; cudaMemcpy(s_dev, s_host, num_of_blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
- 13. summator<<<num_of_blocks, threads_per_block>>>(f_dev, s_dev,N); cudaThreadSynchronize(); cudaMemcpy(s_host, s_dev, num_of_blocks*threads_per_block*sizeof(float) , cudaMemcpyDeviceToHost); for(i=0;i<num_of_blocks*threads_per_block;i++) s+=s_host[i]; return s/(float)N; }
- 15. Скачать презентацию