import numpy as np
from matplotlib import pyplot as plt
from numba import cuda, float32, float64


def mojeF(x):
    """Integrand f(x) = x**2.

    ``x`` is expected to be an array of sample points; the product is taken
    element-wise by ``x * x`` and returned as a new object.
    """
    return x * x

def IntegrujTrapz(x,y,h):
    """Integrate sampled values with the composite trapezoidal rule.

    Parameters
    ----------
    x : any
        Not used by the computation; kept so existing callers that pass the
        sample points keep working.
    y : array
        Function values at equally spaced points. Must support ``.sum()``
        and indexing with ``0`` and ``-1``.
    h : float
        Spacing between two neighbouring sample points.

    Returns
    -------
    The value ``h * (sum of interior samples) + h * (y[0] + y[-1]) / 2``.
    """
    # Interior samples carry weight 1; obtain their sum by removing both end
    # points from the total.
    interior = y.sum() - y[-1] - y[0]
    # The two end points carry weight 1/2 each.
    ends = y[0] + y[-1]
    return h*interior + h*ends/2.

@cuda.reduce
def SumaTrapz(A,B):
    """Binary combiner handed to ``cuda.reduce``: the sum of both operands."""
    total = A + B
    return total

@cuda.jit
def CudaTrapz(y,Ival):
    """CUDA kernel: write the sum of all elements of ``y`` to ``Ival[0]``.

    Parameters
    ----------
    y : device array
        Values to add up.
    Ival : device array
        Output; only element 0 is written.

    Notes
    -----
    * The kernel reads the module-level constant ``ThreadsPerBlock`` for the
      size of the shared buffer and for the loop stride, so it has to be
      launched with exactly ``ThreadsPerBlock`` threads per block.
    * The pairwise reduction halves the stride each round, which only covers
      every slot when ``ThreadsPerBlock`` is a power of two.
    * Partial sums are kept in float64 so that float64 input is not rounded
      to single precision while accumulating.
    """
    # Threads are indexed by threadIdx only, so every block would compute the
    # same total and store it in the same slot. Let block 0 do the work; the
    # other blocks leave as a whole, before any barrier is reached.
    if cuda.blockIdx.x != 0:
        return

    tx = cuda.threadIdx.x
    partial = cuda.shared.array(shape=ThreadsPerBlock, dtype=float64)

    # Step 1: thread tx adds up y[tx], y[tx + ThreadsPerBlock], ...
    acc = 0.0
    idx = tx
    while idx < y.size:
        acc += y[idx]
        idx += ThreadsPerBlock
    partial[tx] = acc
    cuda.syncthreads()

    # Step 2: pairwise reduction in shared memory; the barrier after each
    # round makes the updated slots visible before the next round reads them.
    stride = ThreadsPerBlock // 2
    while stride > 0:
        if tx < stride:
            partial[tx] += partial[tx + stride]
        cuda.syncthreads()
        stride //= 2

    # Step 3: slot 0 now holds the total.
    if tx == 0:
        Ival[0] = partial[0]


# Launch configuration. ThreadsPerBlock is also read inside CudaTrapz (shared
# buffer size and loop stride), so it must stay a module-level name.
blocksPerGrid = 16
ThreadsPerBlock = 64

# Sample f(x) on [xmin, xmax] with one point per launched thread.
xmin = 0
xmax = 1
bins = blocksPerGrid*ThreadsPerBlock
x = np.linspace(xmin, xmax, bins)
y = mojeF(x)
# linspace returns `bins` points including both ends, i.e. bins - 1 intervals.
h = (x[-1] - x[0])/(x.size - 1)

plt.plot(x, y)
plt.show()

# Reference value computed on the host.
Ival = IntegrujTrapz(x, y, h)


IntValCuda = np.zeros((y.size))

stream = cuda.stream()
y_d = cuda.to_device(y, stream=stream)
Int_d = cuda.to_device(IntValCuda, stream=stream)
# The uploads were queued on `stream`, while the reductions below are issued
# without a stream argument; wait until the data is on the device.
stream.synchronize()

# Variant 1: cuda.reduce does the summation, trapezoid weights applied on host.
Itval_CudaR = h*(SumaTrapz(y_d)-y[-1]-y[0]) + h*(y[0]+y[-1])/2.


# Variant 2: hand-written kernel; run only when the launch grid matches the
# number of samples.
if blocksPerGrid*ThreadsPerBlock == y.size:

    CudaTrapz[blocksPerGrid, ThreadsPerBlock](y_d, Int_d)
    IntValCuda = Int_d.copy_to_host(stream=stream)
    # The download was queued on `stream`; wait for it before reading.
    stream.synchronize()
    print('done')
    # Element 0 holds the plain sum; turn it into the trapezoid value.
    IntValCuda[0] = h*(IntValCuda[0]-y[-1]-y[0]) + h*(y[0]+y[-1])/2.

print(Ival)
print(Itval_CudaR)
print(IntValCuda[0])