12 #include <curand_kernel.h>
14 #include <cuda_runtime.h>
15 #include <device_launch_parameters.h>
// Fragment: the enclosing function header is not present in this view.
// The visible statements allocate device buffers and upload host-side model data.
// NOTE(review): presumably the body of MigrateMatrix(), which is invoked elsewhere
// in this file -- confirm against the complete source.
21 cudaError_t cudastatus;
// Allocate matrixSize chars on the device for device_matrixIndex.
23 cudastatus = cudaMalloc((
char**)&(
this->device_matrixIndex),
24 this->matrixSize*
sizeof(
char));
// Hand the status and a message to CudaCheckNotifyErr (defined outside this view;
// whether it aborts or only reports cannot be determined from here).
25 CudaCheckNotifyErr(cudastatus,
"Cuda failed to initialize device_matrixIndex.");
// Allocate matrixSize long ints on the device for device_totalEdgeWeights.
27 cudastatus = cudaMalloc((
long int **)&(
this->device_totalEdgeWeights),
this->matrixSize*
sizeof(
long int));
28 CudaCheckNotifyErr(cudastatus,
"Cuda failed to initialize device_totalEdgeWeights.");
// Blocking host-to-device copy of the index array (matrixSize bytes).
30 cudastatus = cudaMemcpy(
this->device_matrixIndex,
this->matrixIndex,
31 this->matrixSize*
sizeof(
char), cudaMemcpyHostToDevice);
32 CudaCheckNotifyErr(cudastatus,
"Cuda failed to copy to device memory. (Index)");
// Blocking host-to-device copy of the per-row totals (matrixSize long ints).
34 cudastatus = cudaMemcpy(
this->device_totalEdgeWeights,
this->totalEdgeWeights,
35 this->matrixSize*
sizeof(
long int), cudaMemcpyHostToDevice);
36 CudaCheckNotifyErr(cudastatus,
"Cuda failed to copy to device memory. (Total Edge Values)");
// Upload the matrixSize x matrixSize edge matrix via CudaMigrate2DFlat<char>.
// NOTE(review): helper is defined outside this view; presumably it allocates the
// destination and copies row by row into a flat device buffer -- verify.
38 cudastatus = CudaMigrate2DFlat<
char>(
39 &(
this->device_edgeMatrix),
this->edgeMatrix,
this->matrixSize,
this->matrixSize);
40 CudaCheckNotifyErr(cudastatus,
" Cuda failed to initialize edge matrix.");
// Same upload for the value matrix, with long int elements.
42 cudastatus = CudaMigrate2DFlat<
long int>(
43 &(
this->device_valueMatrix),
this->valueMatrix,
this->matrixSize,
this->matrixSize);
44 CudaCheckNotifyErr(cudastatus,
" Cuda failed to initialize value matrix row.");
// Fragment: the enclosing function header, loops and several statements are not
// present in this view. The visible statements drive generation on the host:
// device selection, model upload, alternating kernel launches on streams 0 and 1,
// and handling of a `leftover` count.
// NOTE(review): the return values of cudaGetDeviceProperties and cudaChooseDevice
// are discarded here, unlike the cudaMalloc/cudaMemcpy calls that go through
// CudaCheckNotifyErr.
61 cudaGetDeviceProperties(&prop, device);
62 cudaChooseDevice(&device, &prop);
// Flatten the host matrices, then upload the model to the device.
64 this->FlattenMatrix();
66 this->MigrateMatrix();
// Open the output file by name.
// NOTE(review): no check that the open succeeded is visible in this view.
68 std::ofstream wordlist;
70 wordlist.open(wordlistFileName);
// Set up the per-stream resources used by the launches below.
83 this->prepKernelMemoryChannel(alternatingKernels);
// Start work for kernel slot 1.
93 this->LaunchAsyncKernel(1, minLen, maxLen);
// Block until stream 1 has finished, start slot 0, then collect slot 1's output.
// NOTE(review): cudaStreamSynchronize results are not checked anywhere in this fragment.
99 cudaStreamSynchronize(
this->cudastreams[1]);
100 this->LaunchAsyncKernel(0, minLen, maxLen);
103 this->GatherAsyncKernelOutput(1, bFileIO, wordlist);
// Mirror step: wait for stream 0, relaunch slot 1, then collect slot 0's output.
106 cudaStreamSynchronize(
this->cudastreams[0]);
107 this->LaunchAsyncKernel(1, minLen, maxLen);
110 this->GatherAsyncKernelOutput(0, bFileIO, wordlist);
// Wait for stream 1, launch slot 0, gather slot 1, then wait for stream 0 and
// gather slot 0; no further launch appears in this sequence.
115 cudaStreamSynchronize(
this->cudastreams[1]);
116 this->LaunchAsyncKernel(0, minLen, maxLen);
117 this->GatherAsyncKernelOutput(1, bFileIO, wordlist);
118 cudaStreamSynchronize(
this->cudastreams[0]);
119 this->GatherAsyncKernelOutput(0, bFileIO, wordlist);
// Return when `leftover` is zero (its computation is not visible in this view).
122 if(!leftover)
return;
124 std::cerr <<
"Remaining line count (" << leftover <<
") is lower than partition. Adjusting CUDA workload..\n";
// One more launch / synchronize / gather cycle on slot 0.
126 this->LaunchAsyncKernel(0, minLen, maxLen);
127 cudaStreamSynchronize(
this->cudastreams[0]);
128 this->GatherAsyncKernelOutput(0, bFileIO, wordlist);
// Return again if nothing remains.
131 if(!leftover)
return;
133 std::cerr <<
"Remaining line count (" << leftover <<
") is lower than minimum possible. Handing over to CPU generation.\n";
138 if(!leftover)
return;
139 std::cerr <<
"Remaining " << leftover <<
" lines are absolutely not worth printing.\n";
// Generate the remaining `leftover` items through the Markov::API::ModelMatrix
// implementations of ConstructMatrix and FastRandomWalk.
// NOTE(review): the literal 1 is presumably a thread count -- confirm against
// the ModelMatrix::FastRandomWalk signature.
140 Markov::API::ModelMatrix::ConstructMatrix();
141 Markov::API::ModelMatrix::FastRandomWalk(leftover, &wordlist, minLen, maxLen, 1, bFileIO);
// Fragment: the enclosing function header and several loop bodies are not present
// in this view. The visible statements create `numberOfStreams` CUDA streams and
// allocate one device output buffer per stream.
// NOTE(review): presumably the body of prepKernelMemoryChannel() -- confirm.
// Heap-allocate the stream handle array; the matching delete[] is not visible here.
147 this->cudastreams =
new cudaStream_t[numberOfStreams];
// Create each stream.
// NOTE(review): cudaStreamCreate's return value is discarded.
148 for(
int i=0;i<numberOfStreams;i++)
149 cudaStreamCreate(&
this->cudastreams[i]);
// The body of this loop is not present in this view.
152 for(
int i=0;i<numberOfStreams;i++)
155 cudaError_t cudastatus;
// Allocate cudaPerKernelAllocationSize bytes of device memory for each stream's
// output buffer and report the status through CudaCheckNotifyErr.
157 for(
int i=0;i<numberOfStreams;i++){
158 cudastatus = cudaMalloc((
char**)&(device_outputBuffer[i]), cudaPerKernelAllocationSize);
159 CudaCheckNotifyErr(cudastatus,
"Failed to establish memory channel. Possibly out of VRAM?");
// The body of this loop is not present in this view.
163 for(
int i=0;i<numberOfStreams;i++){
// Fragment: the enclosing function header is not present in this view.
// Launches FastRandomWalkCUDAKernel with cudaBlocks blocks of cudaThreads threads,
// 0 bytes of dynamic shared memory, on the stream selected by kernelID. The kernel
// receives this slot's output buffer and seed array plus the device-side model
// arrays uploaded earlier.
// NOTE(review): no cudaGetLastError() call is visible after the launch in this
// view, so a bad launch configuration would go unnoticed here -- confirm against
// the full function.
175 FastRandomWalkCUDAKernel<<<cudaBlocks,cudaThreads,0,
this->cudastreams[kernelID]>>>(iterationsPerKernelThread, minLen, maxLen,
this->device_outputBuffer[kernelID],
this->device_matrixIndex,
176 this->device_totalEdgeWeights,
this->device_valueMatrix,
this->device_edgeMatrix,
this->matrixSize, cudaMemPerGrid,
this->device_seeds[kernelID]);
// Fragment: the enclosing function header and the rest of the body are not present
// in this view.
// Blocking device-to-host copy of cudaPerKernelAllocationSize bytes from this
// slot's device output buffer into the matching host buffer.
// NOTE(review): the cudaMemcpy status is discarded, so a failed copy (or an
// earlier asynchronous kernel fault surfacing here) is not reported.
181 cudaMemcpy(
this->outputBuffer[kernelID],
this->device_outputBuffer[kernelID],cudaPerKernelAllocationSize, cudaMemcpyDeviceToHost);
// Fragment: the kernel header, variable declarations, braces and several statements
// are not present in this view.
// NOTE(review): presumably the body of FastRandomWalkCUDAKernel -- confirm.
// Flat global thread index for a 1D launch.
197 int kernelWorkerIndex = threadIdx.x + blockIdx.x * blockDim.x;
// Running offset into this thread's output slice.
207 long int bufferctr = 0;
208 unsigned long int *x,*y,*z,t;
// Each thread writes into its own slice of outputBuffer, memoryPerKernelGrid bytes apart.
// NOTE(review): the index product is computed in int; confirm it cannot overflow
// for the largest grid and slice size used.
209 char* res = &outputBuffer[kernelWorkerIndex*memoryPerKernelGrid];
// Each thread owns three consecutive entries of `seed`, referenced through x, y and z.
210 x=&seed[kernelWorkerIndex*3];
211 y=&seed[kernelWorkerIndex*3+1];
212 z=&seed[kernelWorkerIndex*3+2];
// Outer loop runs n times; most of its body is not visible here.
213 for (
int i = 0; i < n; i++) {
// Locate `cur` within matrixIndex and convert the match to a row index.
// The three-argument strchr is not the C library function; presumably the bounded
// helper defined in this file.
// NOTE(review): if that helper returns NULL, the subtraction below yields an
// invalid index -- confirm `cur` is always present in matrixIndex.
217 e = strchr(matrixIndex, cur, matrixSize);
218 index = e - matrixIndex;
// Reduce the value at *z modulo this row's total weight.
// NOTE(review): a zero entry in totalEdgeWeights would make this a modulo by zero
// -- confirm rows always have a positive total.
231 selection = *z % totalEdgeWeights[index];
// Walk the row (row-major index*matrixSize + j), subtracting each weight from
// `selection`; the condition that picks `next` is not visible in this view.
232 for(
int j=0;j<matrixSize-1;j++){
233 selection -= valueMatrix[index*matrixSize + j];
// sizeof(char) is 1, so this indexes edgeMatrix at index*matrixSize + j.
235 next = edgeMatrix[index*
sizeof(
char)*matrixSize + j];
// Stop at maxLen; on a negative `next`, continue while the output is shorter than
// minLen and stop otherwise.
240 if (len >= maxLen)
break;
241 else if ((next < 0) && (len < minLen))
continue;
242 else if (next < 0)
break;
// Append the current character to the output slice.
244 res[bufferctr + len++] = cur;
// Append a newline after the characters written above.
246 res[bufferctr + len++] =
'\n';
// Write a NUL byte at the current bufferctr offset.
249 res[bufferctr] =
'\0';
// Fragment: the function header and the loop body are not present in this view.
// NOTE(review): presumably part of the three-argument strchr helper called from
// the kernel code -- confirm.
// Loop with no condition in its header: each pass advances p and decrements s_len;
// the exit tests are not visible here.
253 for (;; ++p, s_len--) {
// Returns a null char pointer.
257 return((
char *)NULL);
// Fragment: the enclosing function header, the allocations and the loop over `i`
// are not present in this view.
// NOTE(review): presumably the body of FlattenMatrix() -- confirm.
// Copy row i of edgeMatrix (matrixSize bytes) into flatEdgeMatrix at offset i*matrixSize.
266 memcpy(&
this->flatEdgeMatrix[i*
this->matrixSize],
this->edgeMatrix[i],
this->matrixSize );
// Copy row i of valueMatrix (matrixSize long ints) into flatValueMatrix at element
// offset i*matrixSize.
267 memcpy(&
this->flatValueMatrix[i*
this->matrixSize],
this->valueMatrix[i],
this->matrixSize*
sizeof(
long int) );