# -*- coding: utf-8 -*-
"""Kopie sešitu Cuda.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12QQy-u51lLVNeHaODBIg_vIZphupQpRW
"""

from numba import cuda
from numba.types import float64, float32
import numba
import numpy as np
import time
import cmath

"""# Dot product"""
@cuda.reduce
def SumaDOT(A,B):
    """Binary operation handed to ``cuda.reduce``: add two values."""
    total = A + B
    return total

"""Cuda implementation"""

@cuda.jit
def dot_Secti(a,b):
    """Sum every element of ``a`` and store the total in ``b[0]``.

    Each thread first accumulates a strided partial sum over ``a`` and the
    partial sums are then combined pairwise through shared memory.

    The kernel indexes only with ``threadIdx.x`` (the block index is never
    read), so every launched block computes the same total and writes the
    same value to ``b[0]``.

    The module-level ``threads`` value is used both as the shared-array size
    and as the loop stride, so the kernel must be launched with exactly
    ``threads`` threads per block.

    Args:
        a: 1D device array of the values to add up.
        b: Device array whose element 0 receives the total.
    """
    tx = cuda.threadIdx.x
    n = a.shape[0]

    # One slot per thread. float64 matches the per-thread accumulator below,
    # so nothing is rounded when the partial sum is stored.
    suma_shared = cuda.shared.array(shape=threads, dtype=float64)

    # Strided partial sum: thread tx handles tx, tx + threads, tx + 2*threads, ...
    suma = 0.0
    i = tx
    while i < n:
        suma += a[i]
        i += threads

    suma_shared[tx] = suma
    cuda.syncthreads()

    # Pairwise reduction of the `active` leading slots. Rounding the half up
    # keeps the middle element when `active` is odd, so the result is also
    # correct when `threads` is not a power of two.
    active = threads
    while active > 1:
        half = (active + 1) // 2
        if tx + half < active:
            suma_shared[tx] += suma_shared[tx + half]
        # All additions of this step must land before the next step reads them.
        cuda.syncthreads()
        active = half

    if tx == 0:
        b[0] = suma_shared[0]
        


@cuda.jit
def dot_product(a, b):
    """Multiply ``a`` by ``b`` element-wise, writing the products back into ``a``.

    Despite its name this kernel performs only the multiplication step; no
    summation happens here.

    Args:
        a: 1D device array; overwritten in place with ``a[i] * b[i]``.
        b: 1D device array read at the same indices as ``a``.
    """
    length = a.shape[0]

    # Global position of this thread within the 1D launch.
    idx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Threads past the end of the array leave the data untouched.
    if idx < length:
        a[idx] *= b[idx]

    # Barrier kept as in the original kernel; nothing follows it here.
    cuda.syncthreads()

# --- Dot product benchmark: CUDA kernels vs. NumPy -------------------------
# Launch configuration. `threads` is also read inside dot_Secti (shared-array
# size and loop stride), so it must match the block size used below.
threads = 1024
blocks = 100


# One element per launched thread.
dim = threads * blocks
a = np.random.randint(1,5,dim)
b = np.random.randint(1,5,dim)
c = np.zeros(1)

n = a.shape[0]

mystr= cuda.stream()
ag = cuda.to_device(a, stream=mystr)
bg = cuda.to_device(b,stream=mystr)
cg = cuda.to_device(c,stream=mystr)
# Transfers issued on a stream are asynchronous; wait for them so the inputs
# are on the device before the kernels run and before the timer starts.
mystr.synchronize()


t1 = time.time()

# Step 1: ag[i] <- ag[i] * bg[i] (in place on the device).
dot_product[blocks, threads](ag, bg)
# Sum of the products via the cuda.reduce helper; kept as a cross-check of
# the hand-written reduction below (not printed).
res = SumaDOT(ag)

# Step 2: hand-written reduction of the products into cg[0].
dot_Secti[blocks, threads](ag,cg)
# Make sure the kernel has finished before the result is copied back.
cuda.synchronize()

c_res = cg.copy_to_host(stream=mystr)
# The copy was queued on `mystr`; wait for it before reading c_res on the host.
mystr.synchronize()

# NOTE: on the first run this also includes the JIT compilation of the kernels.
cuda_time = time.time() - t1

print(c_res)

# The original printed the undefined name `a_res` here (NameError); the
# result of the CUDA pipeline is c_res[0].
print(f"CUDA Res: {c_res[0]}, took: {cuda_time}")

t1 = time.time()
np_dot = np.dot(a, b)
np_time = time.time() - t1

print(f"Numpy Res: {np_dot}, took: {np_time}")

"""Sequential"""



"""# Integration: Trapezoidal Rule

$$\int_a^b f(x) dx ≈ Δx\left(\frac{f(x_0)+f(x_n)}{2}+∑_{i=1}^{n-1}f(x_i)\right)$$
"""

def integrate(a, b, blocks, threads, function):
    """Approximate the integral of ``function`` over ``[a, b]`` on the GPU.

    The interval is sampled at ``threads * blocks`` evenly spaced points
    (both endpoints included), ``function`` is evaluated on the host, and the
    ``trapezoid_integral`` kernel weights and sums the samples on the device.

    Args:
        a: Lower integration bound.
        b: Upper integration bound.
        blocks: Number of CUDA blocks to launch.
        threads: Number of threads per block.
        function: Callable applied to the NumPy array of sample points; it
            must return an array of the same length.

    Returns:
        Element 0 of the device array after the kernel has run, i.e. the
        trapezoidal-rule estimate of the integral.

    NOTE(review): ``trapezoid_integral`` separates its reduction steps only
    with ``cuda.syncthreads()``, which is a per-block barrier, so launching
    with ``blocks > 1`` looks unsafe -- confirm before relying on it.
    """
    dim = threads * blocks
    xs = np.linspace(a, b, dim)
    # `dim` samples including both endpoints span `dim - 1` intervals, so the
    # step is (b - a) / (dim - 1), not (b - a) / dim. With a single sample
    # there is no interval; the divisor is clamped to 1 to avoid dividing by
    # zero in that degenerate case.
    dx = (b - a) / max(dim - 1, 1)
    ys = function(xs)

    g_ys = cuda.to_device(ys)
    trapezoid_integral[blocks, threads](g_ys, dx)
    res = g_ys.copy_to_host()
    return res[0]


@cuda.jit
def trapezoid_integral(ys, dx):
    """Apply trapezoidal-rule weights to ``ys`` and sum them in place.

    Every sample is multiplied by ``dx``, the first and last samples are
    additionally halved, and the weighted samples are then added together
    with a pairwise reduction. On completion ``ys[0]`` holds the total; the
    other entries are overwritten with intermediate partial sums.

    The reduction steps are separated only by ``cuda.syncthreads()``, which
    synchronises the threads of one block, so the result is only well
    defined when the whole array is processed by a single block.

    Args:
        ys: 1D device array of function samples; modified in place.
        dx: Spacing between consecutive sample points.
    """
    tx = cuda.threadIdx.x
    bdim = cuda.blockDim.x
    bx = cuda.blockIdx.x

    x = tx + bx * bdim
    n = ys.shape[0]

    # Out-of-range threads skip the work but must NOT return early: every
    # thread of the block has to reach each syncthreads() barrier below.
    if x < n:
        ys[x] *= dx

        # Endpoints carry half weight in the trapezoidal rule.
        if x == 0 or x == n - 1:
            ys[x] /= 2

    cuda.syncthreads()

    # Pairwise reduction: at each step the elements whose index is a multiple
    # of 2 * stride absorb the partial sum `stride` positions to their right.
    # The `x + stride < n` test also excludes out-of-range threads.
    stride = 1
    while stride < n:
        if x % (2 * stride) == 0 and x + stride < n:
            ys[x] += ys[x + stride]
        stride *= 2
        cuda.syncthreads()

def func(x):
    """Integrand ``1 / (sqrt(x) + 1)``; accepts scalars or NumPy arrays."""
    denominator = np.sqrt(x) + 1
    return 1 / denominator

# --- Trapezoidal integration demo ------------------------------------------
# A single block of 100 threads, i.e. 100 sample points.
threads, blocks = 100, 1

# Integration bounds.
a, b = 0, 10

# Vectorised wrapper around the integrand (not used by the call below).
f = np.vectorize(func)

# Time the whole integrate() call.
t1 = time.time()
res = integrate(a, b, blocks, threads, func)
cuda_time = time.time() - t1

print(f"Res: {res}, took: {cuda_time}")
