mean() of a matrix by vectorizing across columns and parallelizing across rows
from tensor import Tensor, TensorShape, TensorSpec
from math import trunc, mod
from memory import memset_zero
from sys.info import simdwidthof, simdbitwidth
from algorithm import vectorize, parallelize, vectorize_unroll
from utils.index import Index
from random import rand, seed
from python import Python
import time
# Work in float32 throughout; simd_width is the number of float32 lanes
# one SIMD register holds on the machine running this notebook.
alias dtype = DType.float32
alias simd_width: Int = simdwidthof[dtype]()
print("SIMD bit width",simdbitwidth())
print("SIMD Width",simd_width)
Create a small Tensor
and visualize the shape of the inputs and outputs.
For this example the small input matrix is 5x12
and the output matrix with mean()
should be 5x1
# Build a small 5x12 tensor of random values so shapes are easy to inspect.
let tx = rand[dtype](5,12)
tensorprint(tx)
# Note: This function will give you an error.
# Run the last cell in the notebook that defines `tensorprint` and then
# come back and run this cell. `tensorprint` is a temporary helper function
# until we have native print support for tensors in the next release
Create a 1000x100000
matrix to make it more computationally intensive.
# Seed the RNG for reproducibility, then allocate the 1000x100000 input
# and the 1000x1 output tensor that will hold the per-row means.
seed(42)
let t = rand[dtype](1000,100000)
var result = Tensor[dtype](t.dim(0),1)
print("Input Matrix shape:",t.shape().__str__())
print("Reduced Matrix shape",result.shape().__str__())
Write a function that calculates the average of each row, the naive way
fn tensor_mean[dtype: DType](t: Tensor[dtype]) -> Tensor[dtype]:
    # Naive row-wise mean of a rank-2 tensor.
    # Returns a t.dim(0) x 1 tensor whose entry i is the average of row i.
    var new_tensor = Tensor[dtype](t.dim(0),1)
    for i in range(t.dim(0)):
        # Accumulate in a scalar local instead of doing a read-modify-write
        # through the output tensor's indexer on every element of the row.
        var row_sum: SIMD[dtype, 1] = 0
        for j in range(t.dim(1)):
            row_sum += t[i,j]
        # Single store per row: sum divided by the column count.
        new_tensor[i] = row_sum / t.dim(1)
    return new_tensor
Vectorized and parallelized approach
fn tensor_mean_vectorize_parallelized[dtype: DType](t: Tensor[dtype]) -> Tensor[dtype]:
    # Row-wise mean of a rank-2 tensor: columns are reduced with SIMD loads
    # (vectorize) and rows are distributed across worker tasks (parallelize).
    # Each task owns exactly one output element, so there are no write races.
    var new_tensor = Tensor[dtype](t.dim(0),1)
    @parameter
    fn parallel_reduce_rows(idx1: Int)->None:
        # idx1 is the row this task reduces.
        @parameter
        fn vectorize_reduce_row[simd_width: Int](idx2: Int) -> None:
            # Load simd_width contiguous elements of row idx1 starting at
            # column idx2 (flat offset into row-major storage), horizontally
            # sum them, and accumulate into the row's output slot.
            new_tensor[idx1] += t.simd_load[simd_width](idx1*t.dim(1)+idx2).reduce_add()
        # 2*simd_width processes two registers' worth per step; vectorize
        # handles any tail columns with narrower loads.
        vectorize[2*simd_width,vectorize_reduce_row](t.dim(1))
        new_tensor[idx1] /= t.dim(1)
    # NOTE(review): worker count is hardcoded to 8 — presumably tuned for the
    # demo machine; consider deriving it from the available core count.
    parallelize[parallel_reduce_rows](t.dim(0),8)
    return new_tensor
# Benchmark the naive Mojo implementation over `reps` runs.
alias reps = 10
let naive_start = time.now()
for i in range(reps):
    _ = tensor_mean[dtype](t)
let naive_elapsed = time.now() - naive_start
# time.now() is in nanoseconds; divide by 1e6 for milliseconds per run.
print("Mojo naive mean:",naive_elapsed/reps/1000000,"ms")
# Benchmark the same row-mean reduction in NumPy for comparison.
let np = Python.import_module("numpy")
let rows = t.dim(0)
let cols = t.dim(1)
# Build a NumPy matrix with the same shape and dtype as the Mojo input.
let np_matrix = np.random.rand(rows,cols).astype(np.float32)
let np_start = time.now()
for i in range(reps):
    _ = np.mean(np_matrix,1)
let np_elapsed = time.now() - np_start
print("Numpy mean:",np_elapsed/reps/1000000,"ms")
# Benchmark the vectorized + parallelized Mojo implementation.
let fast_start = time.now()
for i in range(reps):
    _ = tensor_mean_vectorize_parallelized[dtype](t)
let fast_elapsed = time.now() - fast_start
print("Mojo Vectorized and parallelized mean:",fast_elapsed/reps/1000000,"ms")
from tensor import Tensor, TensorShape, TensorSpec
from math import trunc, mod
fn tensorprint[type: DType](t: Tensor[type])->None:
    # Pretty-print a rank-1, -2, or -3 tensor with four fractional digits,
    # then print its shape, rank, and dtype. Rank 0 or rank > 3 is rejected.
    let rank = t.rank()
    var dim0:Int=0
    var dim1:Int=0
    var dim2:Int=0
    if rank==0 or rank>3:
        print("Error: Tensor rank should be: 1,2, or 3. Tensor rank is ", rank)
        return
    # Normalize every rank onto a fixed dim0 x dim1 x dim2 triple loop by
    # padding the leading dimensions with 1.
    if rank==1:
        dim0 = 1
        dim1 = 1
        dim2 = t.dim(0)
    if rank==2:
        dim0 = 1
        dim1 = t.dim(0)
        dim2 = t.dim(1)
    if rank==3:
        dim0 = t.dim(0)
        dim1 = t.dim(1)
        dim2 = t.dim(2)
    var val:SIMD[type, 1]=0.0
    for i in range(dim0):
        # Outermost bracket only for rank-3 tensors; blank line between slices.
        if i==0 and rank==3:
            print("[")
        else:
            if i>0:
                print()
        for j in range(dim1):
            # Row prefix for rank >= 2: open the 2D block on the first row,
            # otherwise continue on a new indented line.
            if rank!=1:
                if j==0:
                    print_no_newline(" [")
                else:
                    print_no_newline("\n ")
            print_no_newline("[")
            for k in range(dim2):
                # Pick the element for the effective rank.
                if rank==1:
                    val = t[k]
                if rank==2:
                    val = t[j,k]
                if rank==3:
                    val = t[i,j,k]
                # Integer part: truncate toward zero; negatives are printed by
                # flipping the sign and prefixing "-" manually.
                let int_str: String
                if val > 0 or val == 0:
                    int_str = String(trunc(val).cast[DType.int32]())
                else:
                    val = -val
                    int_str = "-"+String(trunc(val).cast[DType.int32]())
                # NOTE(review): float_str[2:6] assumes mod(val,1) stringifies
                # as "0.xxxx..." with at least four fractional digits — confirm
                # for values whose fractional part prints in another format.
                let float_str = String(mod(val,1))
                let s = int_str+"."+float_str[2:6]
                # Space-separate elements within a row.
                if k==0:
                    print_no_newline(s)
                else:
                    print_no_newline(" ",s)
            print_no_newline("]")
        if rank>1:
            print_no_newline("]")
        print()
    if rank==3:
        print("]")
    print("Tensor shape:",t.shape().__str__(),", Tensor rank:",rank,",","DType:", type.__str__())