import time
import numpy as np
from numba import vectorize
from numba import cuda

@vectorize
def SoucetPoleVectorize(A, B):
    """Return the sum of the two operands ``A`` and ``B``.

    The function is wrapped by Numba's ``@vectorize`` decorator; the script
    calls it with whole NumPy arrays to add them element by element.
    """
    total = A + B
    return total

@cuda.jit
def SoucetPoleCuda(A, B, C):
    """CUDA kernel: write ``A[k] + B[k]`` into ``C[k]``, one element per thread.

    The element index is built by hand from the thread index inside its
    block, the block index, and the number of threads per block.
    """
    # Flat index of this thread: offset of its block plus its position in it.
    idx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Threads whose index falls past the end of the input do nothing.
    if idx < A.size:
        C[idx] = A[idx] + B[idx]
    
@cuda.jit
def SoucetPoleCuda2(A, B, C):
    """CUDA kernel: write ``A[k] + B[k]`` into ``C[k]``, one element per thread.

    Same computation as ``SoucetPoleCuda``, but the thread index is obtained
    from ``cuda.grid(1)`` instead of being assembled manually.
    """
    gid = cuda.grid(1)  # lets Numba work out the thread's index by itself

    # Skip threads that have no element to process.
    if gid >= A.size:
        return
    C[gid] = A[gid] + B[gid]
    

# Benchmark driver: add two large integer arrays three ways (vectorized
# function, CUDA kernel fed with host arrays, CUDA kernel with explicit
# device transfers on a stream) and print the elapsed time of each.
# NOTE(review): none of the jitted functions declares a signature, so the
# first call of each presumably includes JIT compilation time -- confirm
# before comparing the printed timings against each other.
N = int(5e7)
A = np.random.randint(low=1, high=5, size=N)
B = np.random.randint(low=1, high=5, size=N)
C = np.zeros(N)

tic = time.perf_counter()
Cvect = SoucetPoleVectorize(A, B)
toc = time.perf_counter()
print('elapsed time vectorize', toc - tic)

# Launch configuration: fix the block size and derive the number of blocks
# with a ceiling division, so every element is covered even when N is not a
# multiple of the block size (the kernels bounds-check the surplus threads).
ThreadsPerBlock = 256
blocksPerGrid = (N + ThreadsPerBlock - 1) // ThreadsPerBlock

# CUDA computation without a stream: host arrays are passed straight to the
# kernel call.
tic = time.perf_counter()
SoucetPoleCuda[blocksPerGrid, ThreadsPerBlock](A, B, C)
toc = time.perf_counter()
print('elapsed time CUDA-noMemory', toc - tic)


# CUDA computation with explicit device memory management on one stream.
tic = time.perf_counter()
stream = cuda.stream()
Ad = cuda.to_device(A, stream=stream)
Bd = cuda.to_device(B, stream=stream)
# C is only written by the kernel, so allocate it on the device instead of
# copying a buffer of zeros from the host.
Cd = cuda.device_array_like(C, stream=stream)
# Queue the kernel on the same stream as the transfers so the copy-in,
# the kernel and the copy-out are ordered with respect to each other.
SoucetPoleCuda[blocksPerGrid, ThreadsPerBlock, stream](Ad, Bd, Cd)
# Open question from the author: a kernel can be launched asynchronously
# within one stream -- could each stream be handed a part of the array?
#SoucetPoleCuda2[blocksPerGrid, ThreadsPerBlock, stream](Ad, Bd, Cd)
C = Cd.copy_to_host(stream=stream)
stream.synchronize()  # wait for everything queued on the stream to finish
toc = time.perf_counter()
print('elapsed time CUDA-memory', toc - tic)


# Sanity check: prints the summed difference between the GPU result and the
# vectorized result.
print(np.sum(C - Cvect))