Markopy
Utilizing Markov models for brute-force attacks
cudaModelMatrix.cu
/** @file cudaModelMatrix.cu
 * @brief CUDA accelerated extension of Markov::API::ModelMatrix
 * @authors Ata Hakçıl
 *
 * @copydoc Markov::API::CUDA::CUDAModelMatrix
 */
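
/*
 * Illustrative usage sketch: a host program is assumed to import a trained
 * model through the Markov::API::ModelMatrix base class, build the matrix
 * form, and then hand generation to the GPU. The model file name and the
 * Import() call are assumptions; ConstructMatrix() and FastRandomWalk() are
 * the members used in this file.
 *
 *     Markov::API::CUDA::CUDAModelMatrix model;
 *     model.Import("trained.mdl");
 *     model.ConstructMatrix();
 *     model.FastRandomWalk(50000000, "wordlist.txt", 6, 12, true, false);
 */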

#include "cudaModelMatrix.h"
#include "cudarandom.h"


#include <curand_kernel.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using Markov::API::CUDA::CUDADeviceController;

namespace Markov::API::CUDA{

    //allocate device buffers and copy the model (index, edge weights, edge/value matrices) to VRAM
    __host__ void Markov::API::CUDA::CUDAModelMatrix::MigrateMatrix(){
        cudaError_t cudastatus;

        cudastatus = cudaMalloc((char**)&(this->device_matrixIndex),
            this->matrixSize*sizeof(char));
        CudaCheckNotifyErr(cudastatus, "Cuda failed to initialize device_matrixIndex.");

        cudastatus = cudaMalloc((long int **)&(this->device_totalEdgeWeights), this->matrixSize*sizeof(long int));
        CudaCheckNotifyErr(cudastatus, "Cuda failed to initialize device_totalEdgeWeights.");

        cudastatus = cudaMemcpy(this->device_matrixIndex, this->matrixIndex,
            this->matrixSize*sizeof(char), cudaMemcpyHostToDevice);
        CudaCheckNotifyErr(cudastatus, "Cuda failed to copy to device memory. (Index)");

        cudastatus = cudaMemcpy(this->device_totalEdgeWeights, this->totalEdgeWeights,
            this->matrixSize*sizeof(long int), cudaMemcpyHostToDevice);
        CudaCheckNotifyErr(cudastatus, "Cuda failed to copy to device memory. (Total Edge Values)");

        cudastatus = CudaMigrate2DFlat<char>(
            &(this->device_edgeMatrix), this->edgeMatrix, this->matrixSize, this->matrixSize);
        CudaCheckNotifyErr(cudastatus, "Cuda failed to initialize edge matrix.");

        cudastatus = CudaMigrate2DFlat<long int>(
            &(this->device_valueMatrix), this->valueMatrix, this->matrixSize, this->matrixSize);
        CudaCheckNotifyErr(cudastatus, "Cuda failed to initialize value matrix.");

    }

    /*__host__ char* Markov::API::CUDA::CUDAModelMatrix::AllocVRAMOutputBuffer(long int n, long int singleGenMaxLen, long int CUDAKernelGridSize, long int sizePerGrid){
        cudaError_t cudastatus;
        cudastatus = cudaMalloc((char **)&this->device_outputBuffer1, CUDAKernelGridSize*sizePerGrid);
        CudaCheckNotifyErr(cudastatus, "Failed to allocate VRAM buffer. (Possibly out of VRAM.)");

        return this->device_outputBuffer1;
    }*/



    __host__ void Markov::API::CUDA::CUDAModelMatrix::FastRandomWalk(unsigned long int n, const char* wordlistFileName, int minLen, int maxLen, bool bFileIO, bool bInfinite){
        cudaDeviceProp prop;
        int device=0;
        cudaGetDeviceProperties(&prop, device);
        cudaChooseDevice(&device, &prop);
        //std::cout << "Flattening matrix." << std::endl;
        this->FlattenMatrix();
        //std::cout << "Migrating matrix." << std::endl;
        this->MigrateMatrix();
        //std::cout << "Migrated matrix." << std::endl;
        std::ofstream wordlist;
        if(bFileIO)
            wordlist.open(wordlistFileName);


        cudaBlocks = 1024;
        cudaThreads = 256;
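        // (assumed) the workload members used below -- iterationsPerKernelThread,
        // totalOutputPerSync, numberOfPartitions, cudaMemPerGrid,
        // cudaPerKernelAllocationSize and alternatingKernels -- are derived at this
        // point from the launch geometry, maxLen and the requested output count n.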
        this->prepKernelMemoryChannel(alternatingKernels);

        unsigned long int leftover = n - (totalOutputPerSync*numberOfPartitions);

        if(bInfinite && !numberOfPartitions) numberOfPartitions=5;
        std::cerr << cudaPerKernelAllocationSize << "\n";

        if(n%totalOutputPerSync) std::cerr << "For optimization, request outputs in multiples of " << totalOutputPerSync << ".\n";

        //start kernelID 1
        this->LaunchAsyncKernel(1, minLen, maxLen);

        for(int i=1;i<numberOfPartitions;i++){
            if(bInfinite) i=0;

            //wait for kernelID 1 to finish, then start kernelID 0
            cudaStreamSynchronize(this->cudastreams[1]);
            this->LaunchAsyncKernel(0, minLen, maxLen);

            //start memcpy from kernel 1 (blocks until done)
            this->GatherAsyncKernelOutput(1, bFileIO, wordlist);

            //wait for kernelID 0 to finish, then start kernelID 1
            cudaStreamSynchronize(this->cudastreams[0]);
            this->LaunchAsyncKernel(1, minLen, maxLen);

            //start memcpy from kernel 0 (blocks until done)
            this->GatherAsyncKernelOutput(0, bFileIO, wordlist);

        }

        //wait for kernelID 1 to finish, start kernelID 0 one last time, then drain both streams
        cudaStreamSynchronize(this->cudastreams[1]);
        this->LaunchAsyncKernel(0, minLen, maxLen);
        this->GatherAsyncKernelOutput(1, bFileIO, wordlist);
        cudaStreamSynchronize(this->cudastreams[0]);
        this->GatherAsyncKernelOutput(0, bFileIO, wordlist);


        if(!leftover) return;
        std::cerr << "Remaining line count (" << leftover << ") is lower than partition. Adjusting CUDA workload...\n";
        this->LaunchAsyncKernel(0, minLen, maxLen);
        cudaStreamSynchronize(this->cudastreams[0]);
        this->GatherAsyncKernelOutput(0, bFileIO, wordlist);

        if(!leftover) return;

        std::cerr << "Remaining line count (" << leftover << ") is lower than minimum possible. Handing over to CPU generation.\n";

        leftover -= this->iterationsPerKernelThread;

        if(!leftover) return;
        std::cerr << "Remaining " << leftover << " lines are absolutely not worth printing.\n";
        Markov::API::ModelMatrix::ConstructMatrix();
        Markov::API::ModelMatrix::FastRandomWalk(leftover, &wordlist, minLen, maxLen, 1, bFileIO);

    }


    //create one CUDA stream plus host/device output buffers and RNG seeds per alternating kernel
    __host__ void Markov::API::CUDA::CUDAModelMatrix::prepKernelMemoryChannel(int numberOfStreams){

        this->cudastreams = new cudaStream_t[numberOfStreams];
        for(int i=0;i<numberOfStreams;i++)
            cudaStreamCreate(&this->cudastreams[i]);

        this->outputBuffer = new char*[numberOfStreams];
        for(int i=0;i<numberOfStreams;i++)
            this->outputBuffer[i] = new char[cudaPerKernelAllocationSize];

        cudaError_t cudastatus;
        this->device_outputBuffer = new char*[numberOfStreams];
        for(int i=0;i<numberOfStreams;i++){
            cudastatus = cudaMalloc((char**)&(device_outputBuffer[i]), cudaPerKernelAllocationSize);
            CudaCheckNotifyErr(cudastatus, "Failed to establish memory channel. Possibly out of VRAM?");
        }

        this->device_seeds = new unsigned long*[numberOfStreams];
        for(int i=0;i<numberOfStreams;i++){
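            // (assumed reconstruction) per-thread seed triplets are generated host-side
            // with the Marsaglia helper from cudarandom.h and migrated to device_seeds[i];
            // the exact calls below are a sketch, not verified against the original API.
            Markov::API::CUDA::Random::Marsaglia *MEarr = new Markov::API::CUDA::Random::Marsaglia[cudaBlocks*cudaThreads];
            this->device_seeds[i] = Markov::API::CUDA::Random::Marsaglia::MigrateToVRAM(MEarr, cudaBlocks*cudaThreads);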
            delete[] MEarr;
        }

    }

    __host__ void Markov::API::CUDA::CUDAModelMatrix::LaunchAsyncKernel(int kernelID, int minLen, int maxLen){

        //if(kernelID == 0);// cudaStreamSynchronize(this->cudastreams[2]);
        //else cudaStreamSynchronize(this->cudastreams[kernelID-1]);
        FastRandomWalkCUDAKernel<<<cudaBlocks,cudaThreads,0, this->cudastreams[kernelID]>>>(iterationsPerKernelThread, minLen, maxLen, this->device_outputBuffer[kernelID], this->device_matrixIndex,
            this->device_totalEdgeWeights, this->device_valueMatrix, this->device_edgeMatrix, this->matrixSize, cudaMemPerGrid, this->device_seeds[kernelID]);
        //std::cerr << "Started kernel" << kernelID << "\n";
    }

    __host__ void Markov::API::CUDA::CUDAModelMatrix::GatherAsyncKernelOutput(int kernelID, bool bFileIO, std::ofstream &wordlist){
        cudaMemcpy(this->outputBuffer[kernelID],this->device_outputBuffer[kernelID],cudaPerKernelAllocationSize, cudaMemcpyDeviceToHost);
        //std::cerr << "Kernel" << kernelID << " output copied\n";
        //each grid slot holds a NUL-terminated block of newline-separated words
        if(bFileIO){
            for(long int j=0;j<cudaPerKernelAllocationSize;j+=cudaMemPerGrid){
                wordlist << &this->outputBuffer[kernelID][j];
            }
        }else{
            for(long int j=0;j<cudaPerKernelAllocationSize;j+=cudaMemPerGrid){
                std::cout << &this->outputBuffer[kernelID][j];
            }
        }
    }

    __global__ void FastRandomWalkCUDAKernel(unsigned long int n, int minLen, int maxLen, char* outputBuffer,
        char* matrixIndex, long int* totalEdgeWeights, long int* valueMatrix, char *edgeMatrix, int matrixSize, int memoryPerKernelGrid, unsigned long *seed){
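
        // Each thread generates n words independently: it repeatedly walks the Markov
        // chain from the start state (199), writes newline-separated results into its
        // private memoryPerKernelGrid-byte slice of outputBuffer, and NUL-terminates
        // the slice when done.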

        int kernelWorkerIndex = threadIdx.x + blockIdx.x * blockDim.x;

        if(n==0) return;

        char* e;
        int index = 0;
        char next;
        int len=0;
        long int selection;
        char cur;
        long int bufferctr = 0;
        unsigned long int *x,*y,*z,t;
        char* res = &outputBuffer[kernelWorkerIndex*memoryPerKernelGrid];
        x=&seed[kernelWorkerIndex*3];
        y=&seed[kernelWorkerIndex*3+1];
        z=&seed[kernelWorkerIndex*3+2];
        for (int i = 0; i < n; i++) {
            cur=199;
            len=0;
            while (true) {
                e = strchr(matrixIndex, cur, matrixSize);
                index = e - matrixIndex;
                /*selection = Markov::API::CUDA::Random::devrandom(
                    seed[kernelWorkerIndex*3],
                    seed[kernelWorkerIndex*3+1],
                    seed[kernelWorkerIndex*3+2]) % totalEdgeWeights[index];*/
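                // advance the three per-thread state words with an xorshift-style step,
                // then draw a value in [0, totalEdgeWeights[index]) to select the next
                // edge by cumulative weight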
                *x ^= *x << 16;
                *x ^= *x >> 5;
                *x ^= *x << 1;

                t = *x;
                *x = *y;
                *y = *z;
                *z = t ^ *x ^ *y;
                selection = *z % totalEdgeWeights[index];
                for(int j=0;j<matrixSize-1;j++){
                    selection -= valueMatrix[index*matrixSize + j];
                    if (selection < 0){
                        next = edgeMatrix[index*sizeof(char)*matrixSize + j];
                        break;
                    }
                }

                if (len >= maxLen) break;
                else if ((next < 0) && (len < minLen)) continue;
                else if (next < 0) break;
                cur = next;
                res[bufferctr + len++] = cur;
            }
            res[bufferctr + len++] = '\n';
            bufferctr+=len;
        }
        res[bufferctr] = '\0';
    }

    __device__ char* strchr(char* p, char c, int s_len){
        //scan at most s_len characters for c; stop early at a NUL terminator
        for (; s_len > 0; ++p, s_len--) {
            if (*p == c)
                return((char *)p);
            if (!*p)
                return((char *)NULL);
        }
        return((char *)NULL);
    }

    //copy the row-pointer matrices into contiguous row-major buffers for device transfer
    __host__ void Markov::API::CUDA::CUDAModelMatrix::FlattenMatrix(){
        this->flatEdgeMatrix = new char[this->matrixSize*this->matrixSize];

        this->flatValueMatrix = new long int[this->matrixSize*this->matrixSize];
        for(int i=0;i<this->matrixSize;i++){
            memcpy(&this->flatEdgeMatrix[i*this->matrixSize], this->edgeMatrix[i], this->matrixSize);
            memcpy(&this->flatValueMatrix[i*this->matrixSize], this->valueMatrix[i], this->matrixSize*sizeof(long int));
        }
    }


};