-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_vector_add.cu
151 lines (116 loc) · 4.12 KB
/
01_vector_add.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
* A very simple CUDA example adding two arrays of ints together.
*
* Shows common paradigm of copy input data to device, copy results back to host
* Introduces many CUDA concepts
* - device pointers with cudaMalloc, cudaMemcpy
* - writing GPU kernel code (and getting threadIdx)
* - launching kernel
* - kernel launch dimensions, threads, blocks, grid
*
* Danny George 2012
*/
#include <stdio.h>
#include <stdlib.h>
void do_the_add(int *a, int *b, int *r, int i);
const int N = 512 * 1024;
// initialize an array with a counting sequence
void fill_array_count(int *arr, const size_t n)
{
for (size_t i=0; i<n; ++i) {
arr[i] = (int)i;
}
}
// initialize an array with a constant number
void fill_array_const(int *arr, const size_t n, const int val)
{
for (size_t i=0; i<n; ++i) {
arr[i] = val;
}
}
// a CUDA kernel function
// the CUDA runtime spawns many parallel threads to execute it
// the executing thread id can be found through the threadIdx.[xyz] and blockIdx.[xyz] variables
// (this example doesn't spawn more than one block)
// the __global__ attribute tells the compiler that this is
// code that is called by the host and run on the device
__global__
void vector_add(int *a, int *b, int *r, const size_t n)
{
// convert from 2D launch to 1D array index
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= N)
return;
r[tid] = a[tid] + b[tid];
// you can call __device__ functions from __global__ functions
//do_the_add(a, b, r, tid);
}
// __device__ tells the compiler this function is called by the device and runs on the device
// __host__ tells the compiler to make another version to run on the host (normal function)
__device__ __host__
void do_the_add(int *a, int *b, int *r, int i)
{
r[i] = a[i] + b[i];
}
int main(int argc, char const *argv[])
{
int *host_a;
int *host_b;
int *host_r;
int *dev_a;
int *dev_b;
int *dev_r;
// NOTE: this example does no error checking!
cudaError_t err;
// ---- ALLOCATE MEMORY ON HOST -----------
host_a = (int *)malloc(sizeof(int) * N);
host_b = (int *)malloc(sizeof(int) * N);
host_r = (int *)malloc(sizeof(int) * N);
if (host_a == NULL || host_b == NULL || host_r == NULL) {
fprintf(stderr, "malloc error on host\n");
exit(1);
}
// ---- ALLOCATE MEMORY ON DEVICE ---------
// cudaMalloc(void **dev_ptr, size_t count)
err = cudaMalloc(&dev_a, sizeof(int) * N);
err = cudaMalloc(&dev_b, sizeof(int) * N);
err = cudaMalloc(&dev_r, sizeof(int) * N);
// ---- INITIALIZE DATA ON HOST -----------
fill_array_count(host_a, N);
fill_array_const(host_b, N, 10);
// ---- COPY DATA OVER TO DEVICE ----------
// cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind)
err = cudaMemcpy(dev_a, host_a, sizeof(int) * N, cudaMemcpyHostToDevice);
err = cudaMemcpy(dev_b, host_b, sizeof(int) * N, cudaMemcpyHostToDevice);
// ---- PERFORM COMPUTATION ON DEVICE -----
int threads_per_block = 128;
int blocks_per_grid = ((N + threads_per_block - 1) / threads_per_block); // integer div, ensures at least 1 block
vector_add<<<blocks_per_grid, threads_per_block>>>(dev_a, dev_b, dev_r, N);
// the <<<dim3 gridDim, dim3 blockDim>>> is a CUDA extension to launch kernels
// grids are made up of blocks
// blocks are made up of threads
// ---- COPY RESULT DATA BACK TO HOST ----
err = cudaMemcpy(host_r, dev_r, sizeof(int) * N, cudaMemcpyDeviceToHost);
// verify results
bool success = true;
for (size_t i=0; i<N; ++i) {
if (host_r[i] != host_a[i] + host_b[i]) {
fprintf(stderr, "ERROR [index %u]: %d != %d + %d", i, host_r[i], host_a[i], host_b[i]);
success = false;
break;
}
}
// ---- CLEANUP -------------------------
// free memory on host
free(host_a);
free(host_b);
free(host_r);
// free memory on device
err = cudaFree(dev_a);
err = cudaFree(dev_b);
err = cudaFree(dev_r);
if (success)
printf("It worked!\n");
else
return 1;
return 0;
}