# -*- coding: utf-8 -*-
"""
Created on Tue May 14 12:38:58 2024

@author: zposel
"""

import time
import numpy as np
from numba import cuda


@cuda.jit
def SoucetMaticNormal(A, B, C):
    """CUDA kernel: write the element-wise product of A and B into C.

    Each thread handles the single element at its own 2D grid position,
    storing ``A[i, j] * B[i, j]`` into ``C[i, j]``. Threads whose position
    lies outside the shape of ``A`` do nothing, so the launch grid may be
    larger than the matrix.

    Note: the name suggests a matrix sum, but the kernel multiplies.
    """
    # cuda.grid(2) yields this thread's absolute (x, y) index in the grid.
    row, col = cuda.grid(2)

    # Surplus threads past the edge of A must not touch memory.
    if row >= A.shape[0] or col >= A.shape[1]:
        return

    C[row, col] = A[row, col] * B[row, col]
            

# Matrix edge length: all matrices in this benchmark are N x N.
N = int(1e3)

# Threads per block along each axis (Nth x Nth threads in one block).
Nth = int(10)
# Blocks per grid along each axis, derived by ceiling division so that
# Nb * Nth >= N holds for any N. A hard-coded block count would silently
# leave part of the matrix uncomputed if N were changed.
Nb = (N + Nth - 1) // Nth

blocksPerGrid = (Nb, Nb)
ThreadsPerBlock = (Nth, Nth)

# Random integer inputs with values in [1, 5); C is a float output buffer.
A = np.random.randint(low=1, high=5, size=(N, N))
B = np.random.randint(low=1, high=5, size=(N, N))
C = np.zeros((N, N))

# CPU reference: NumPy element-wise product (deliberately not np.dot, since
# the GPU kernel also multiplies element by element).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can be too coarse for a millisecond-scale run.
tic = time.perf_counter()
C = A * B
toc = time.perf_counter()
print('elapsed time vectorize',toc-tic)
# Checksum to compare against the GPU result printed later.
print(np.sum(C))

# Fresh zeroed host buffer so the GPU result cannot be confused with the
# CPU result computed above.
C = np.zeros((N, N))

# Move inputs and the output buffer to the device on a dedicated stream.
stream = cuda.stream()
Ad = cuda.to_device(A, stream=stream)
Bd = cuda.to_device(B, stream=stream)
Cd = cuda.to_device(C, stream=stream)

# Warm-up launch: the first call of a @cuda.jit kernel triggers JIT
# compilation, which would otherwise dominate the measured time.
# The kernel is launched on the same stream as the transfers so that it is
# ordered after them.
SoucetMaticNormal[blocksPerGrid, ThreadsPerBlock, stream](Ad, Bd, Cd)
stream.synchronize()

# Kernel launches are asynchronous: without synchronizing before reading the
# clock, only the launch overhead would be measured, not the kernel itself.
tic = time.perf_counter()
SoucetMaticNormal[blocksPerGrid, ThreadsPerBlock, stream](Ad, Bd, Cd)
stream.synchronize()
toc = time.perf_counter()

# The copy back is issued on the stream; wait for it to finish before the
# host reads C, otherwise the checksum could be computed on partial data.
C = Cd.copy_to_host(stream=stream)
stream.synchronize()
print(np.sum(C))

# Kernel time only; host<->device transfers are excluded from the interval.
print('elapsed time CUDA-noMemory',toc-tic)


