import numpy as np
from numba import cuda, float32, float64

from numba.core.errors import NumbaPerformanceWarning
import warnings

warnings.simplefilter('ignore', category=NumbaPerformanceWarning)


def sectiSekvenci1(A, suma):
    """Sum the elements of ``A`` one by one in a plain Python loop.

    Args:
        A: One-dimensional iterable of numbers (list, tuple, NumPy array, ...).
        suma: Ignored. Kept only so existing two-argument calls keep working;
            the accumulation always starts from 0.0.

    Returns:
        The left-to-right sum of all elements of ``A``; 0.0 for empty input.
    """
    total = 0.0
    # Walk A itself rather than a module-level length, so the result always
    # covers exactly the elements that were passed in.
    for value in A:
        total += value
    return total

@cuda.reduce
def sum_reduce(a, b):
    """Combine two values by addition.

    This is the binary operator handed to ``cuda.reduce``; the decorated
    object is what the script later calls on a device array.
    """
    combined = a + b
    return combined

@cuda.jit()
def sumArrCUDA(A,B):
    """Sum all elements of ``A`` within one thread block and store it in ``B[0]``.

    Each thread first accumulates the elements ``A[tx]``, ``A[tx + blockDim]``,
    ... into a private float64 accumulator, then the per-thread partial sums
    are combined through a shared-memory tree reduction.

    Requirements:
        * The launch block size must be a power of two; the reduction halves
          the active range each step and would otherwise skip partial sums.
        * The launch block size must not exceed the module-level
          ``ThreadsPerBlock``, which fixes the size of the shared scratch
          array.

    The element index does not depend on the block index, so every launched
    block computes the full sum on its own and its thread 0 writes ``B[0]``.

    Args:
        A: One-dimensional device array to be summed.
        B: Device array with at least one element; ``B[0]`` receives the sum.
    """
    tx = cuda.threadIdx.x
    block_size = cuda.blockDim.x
    n = A.shape[0]

    # float64 scratch space so the partial sums keep the precision of the
    # float64 accumulator below (a float32 buffer would truncate them).
    partial_sums = cuda.shared.array(shape=ThreadsPerBlock, dtype=float64)

    # Phase 1: each thread sums its own strided slice of A.
    acc = 0.0
    idx = tx
    while idx < n:
        acc += A[idx]
        idx += block_size

    partial_sums[tx] = acc
    # All partial sums must be visible before any thread reads a neighbour's.
    cuda.syncthreads()

    # Phase 2: tree reduction, folding the upper half onto the lower half.
    half = block_size // 2
    while half > 0:
        if tx < half:
            partial_sums[tx] += partial_sums[tx + half]
        # Barrier sits outside the `if` so every thread of the block reaches it.
        cuda.syncthreads()
        half //= 2

    # Thread 0 now holds the block-wide total.
    if tx == 0:
        B[0] = partial_sums[0]


#--------------------------------main------------------------------------------
# Launch configuration for sumArrCUDA.
blocksPerGrid = 1
# ThreadsPerBlock must be a power of two: the kernel's tree reduction halves
# the active range on every step and would drop partial sums otherwise.
# It should also be a multiple of the warp size.
ThreadsPerBlock = 1024
N = blocksPerGrid * ThreadsPerBlock


A = np.random.rand(N)
#A=np.linspace(0,N,N, endpoint=False)
# Reference value computed by NumPy.
print(np.sum(A))
print(A)
print("---------------")
#-----------------------------------------------------------------------------
# Sequential host-side sum; the second argument is ignored by the function.
suma = sectiSekvenci1(A, 0.0)
print(suma)
#-----------------------------------------------------------------------------

#-----------------------------------------------------------------------------
# Hand-written kernel: only B[0] is written, so one output element suffices.
B = np.zeros(1)
Ad = cuda.to_device(A)
Bd = cuda.to_device(B)
# Always true here because N is derived from the launch configuration above;
# kept as a guard in case N is changed independently.
if blocksPerGrid * ThreadsPerBlock == N:
    sumArrCUDA[blocksPerGrid, ThreadsPerBlock](Ad, Bd)
    B = Bd.copy_to_host()
    print(B[0])
#-----------------------------------------------------------------------------

#-----------------------------------------------------------------------------
# Numba's built-in reduction. Ad is reused: nothing above writes to it, so a
# second host-to-device copy of A is unnecessary.
print(sum_reduce(Ad))
#-----------------------------------------------------------------------------
