Multiprocessor Systems (continued). Graphics Accelerators. (Lecture 18)

Slide 2

Slide 3

Slide 4

Physical view of the GPU

Slide 5

Logical view of the GPU (CUDA architecture)

Slide 6

Slide 7

Slide 8

#include <stdio.h>
#include <stdlib.h>

float serial(float* f, long N);
__global__ void summator(float* f, float* s, long N);
float parallel(float* f, long N, int num_of_blocks, int threads_per_block);

int main(int argc, char* argv[]){
    long N;
    int i;
    float* fun;
    int num_of_blocks, threads_per_block;
    if(argc<4) { printf("USAGE: test1 <N> <num_of_blocks> <threads_per_block>\n"); return -1; }
Slide 9

    N=atoi(argv[1]);
    num_of_blocks=atoi(argv[2]);
    threads_per_block=atoi(argv[3]);
    fun=(float*)malloc(N*sizeof(float));
    /* fill the array with midpoint samples of x^2 on [0,1] */
    for(i=0;i<N;i++) fun[i]=((i+0.5F)*(1.0/N))*((i+0.5F)*(1.0/N));
    printf("Serial calculation is over! Result=%g\n", serial(fun,N));
    printf("Parallel calculation is over! Result=%g\n",
           parallel(fun,N,num_of_blocks,threads_per_block));
    return 0;
}
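
Assuming the listing is saved as test1.cu (the file name comes only from the USAGE string above and is otherwise an assumption), it can be built with NVIDIA's nvcc compiler, e.g. nvcc -o test1 test1.cu, and run as ./test1 <N> <num_of_blocks> <threads_per_block>, for example ./test1 1000000 64 256: the number of sample points, the number of blocks, and the number of threads per block. The argument values here are illustrative only.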
Slide 10

float serial(float* f, long N){
    int i;
    double s=0.0;               /* accumulate in double to reduce rounding error */
    for(i=0;i<N;i++) s+=f[i];
    return s/(float)N;          /* mean of the N samples */
}
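
For reference: fun[i] = ((i+0.5)/N)^2 are midpoint samples of x^2 on [0,1], so the average computed here is the midpoint-rule approximation

    \frac{1}{N}\sum_{i=0}^{N-1}\left(\frac{i+0.5}{N}\right)^{2} \approx \int_{0}^{1} x^{2}\,dx = \frac{1}{3},

and both the serial and the parallel version should therefore print a value close to 0.3333.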

Slide 11

float parallel(float* f, long N, int num_of_blocks, int threads_per_block){
    float* f_dev;
    float* s_dev;
    float* s_host;
    float s=0.0;
    int i;
    /* copy the input array to device memory */
    cudaMalloc((void **) &f_dev, N*sizeof(float));
    cudaMemcpy(f_dev, f, N*sizeof(float), cudaMemcpyHostToDevice);
Slide 12

    /* one partial-sum slot per GPU thread */
    s_host=(float*)malloc(num_of_blocks*threads_per_block*sizeof(float));
    cudaMalloc((void **) &s_dev, num_of_blocks*threads_per_block*sizeof(float));
    for(i=0;i<num_of_blocks*threads_per_block;i++) s_host[i]=0.0;
    cudaMemcpy(s_dev, s_host, num_of_blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);

Slide 13

    /* launch one GPU thread per partial sum */
    summator<<<num_of_blocks, threads_per_block>>>(f_dev, s_dev, N);
    cudaThreadSynchronize();    /* legacy name; cudaDeviceSynchronize() in current CUDA */
    cudaMemcpy(s_host, s_dev, num_of_blocks*threads_per_block*sizeof(float), cudaMemcpyDeviceToHost);
    /* reduce the per-thread partial sums on the host */
    for(i=0;i<num_of_blocks*threads_per_block;i++) s+=s_host[i];
    return s/(float)N;
}
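
The body of the summator kernel is not shown on these slides. A minimal sketch consistent with the host code above, in which each thread writes one partial sum into s (the grid-stride loop is an assumption, not necessarily the version used in the lecture):

__global__ void summator(float* f, float* s, long N){
    long idx = blockIdx.x * blockDim.x + threadIdx.x;   /* global thread index */
    long stride = (long)gridDim.x * blockDim.x;         /* total number of threads */
    float sum = 0.0F;
    for(long i = idx; i < N; i += stride)               /* each thread sums a strided subset of f */
        sum += f[i];
    s[idx] = sum;                                       /* one partial sum per thread */
}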