Vectorization and Parallelization in MojošŸ”„

Speeding up Python in MojošŸ”„ using vectorization and parallelization

Example: Calculate the row-wise mean() of a matrix by vectorizing across columns and parallelizing across rows
# Mojo stdlib imports: tensors, scalar math helpers, SIMD introspection,
# and the vectorize/parallelize higher-order functions used below.
from tensor import Tensor, TensorShape, TensorSpec
from math import trunc, mod
from memory import memset_zero
from sys.info import simdwidthof, simdbitwidth
from algorithm import vectorize, parallelize, vectorize_unroll
from utils.index import Index
from random import rand, seed
from python import Python
import time
# Element type used throughout the notebook; simd_width is the number of
# float32 lanes one SIMD register holds on the host CPU.
alias dtype = DType.float32
alias simd_width: Int = simdwidthof[dtype]()
print("SIMD bit width",simdbitwidth())
print("SIMD Width",simd_width)

Create a small Tensor and visualize the shape of the inputs and outputs. For this small example the input matrix is 5x12 and the output matrix of means() should be 5x1

# Build a 5x12 tensor of random float32 values and display it.
let tx = rand[dtype](5,12)
tensorprint(tx) 

# Note: This function will give you an error. 
# Run the last cell in the notebook that defines `tensorprint` and then 
# come back and run this cell. `tensorprint` is a temporary helper function 
# until we have native print support for tensors in the next release.

Create a 1000x100000 matrix to make it more computationally intensive.

# Fixed seed so the benchmark input is reproducible across runs.
seed(42)
# 1000 rows x 100000 columns of random float32; `result` holds one mean
# per row (shape 1000x1).
let t = rand[dtype](1000,100000)
var result = Tensor[dtype](t.dim(0),1)

print("Input Matrix shape:",t.shape().__str__())
print("Reduced Matrix shape",result.shape().__str__())

Write a function to calculate the average of each row the naive way

fn tensor_mean[dtype: DType](t: Tensor[dtype]) -> Tensor[dtype]:
    # Naive row-wise mean: one scalar load per element. For each row,
    # accumulate the column values, then divide by the column count.
    let n_rows = t.dim(0)
    let n_cols = t.dim(1)
    var means = Tensor[dtype](n_rows,1)
    for r in range(n_rows):
        for c in range(n_cols):
            means[r] += t[r,c]
        means[r] /= n_cols
    return means

Vectorized and parallelized approach

fn tensor_mean_vectorize_parallelized[dtype: DType](t: Tensor[dtype]) -> Tensor[dtype]:
    # Row-wise mean: rows are distributed across tasks via `parallelize`;
    # within each row the column sum is computed with SIMD loads via
    # `vectorize`, then divided by the column count.
    var new_tensor = Tensor[dtype](t.dim(0),1)
    @parameter
    fn parallel_reduce_rows(idx1: Int)->None:
        # idx1 is the row index this task reduces.
        @parameter
        fn vectorize_reduce_row[simd_width: Int](idx2: Int) -> None:
            # Load `simd_width` contiguous elements of row idx1 starting at
            # column idx2 (flat offset into row-major storage) and fold the
            # lane sum into the row's running total.
            new_tensor[idx1] += t.simd_load[simd_width](idx1*t.dim(1)+idx2).reduce_add()
        # Vectorize with twice the native lane count; `vectorize` handles
        # any tail columns that are not a multiple of the width.
        vectorize[2*simd_width,vectorize_reduce_row](t.dim(1))
        new_tensor[idx1] /= t.dim(1)
    # NOTE(review): worker count is hard-coded to 8 — presumably tuned to
    # the demo machine; confirm against the available core count.
    parallelize[parallel_reduce_rows](t.dim(0),8)
    return new_tensor
# Naive approach in Mojo
alias reps = 10
var tm1 = time.now()
for i in range(reps):
    _ = tensor_mean[dtype](t)
# time.now() reports nanoseconds; /1000000 converts the per-rep average to ms.
let dur1 = time.now()-tm1
print("Mojo naive mean:",dur1/reps/1000000,"ms")

# NumPy approach 
let np = Python.import_module("numpy")
let dim0 = t.dim(0)
let dim1 = t.dim(1)
# NOTE(review): t_np is freshly generated random data of the same shape,
# not a copy of `t` — fine for timing, but the outputs are not comparable.
let t_np = np.random.rand(dim0,dim1).astype(np.float32)
var tm2 = time.now()
for i in range(reps):
    _ = np.mean(t_np,1)
let dur2 = time.now()-tm2
print("Numpy mean:",dur2/reps/1000000,"ms")

# Vectorized and parallelized approach in Mojo
var tm3 = time.now()
for i in range(reps):
    _ = tensor_mean_vectorize_parallelized[dtype](t)
let dur3 = time.now()-tm3
print("Mojo Vectorized and parallelized mean:",dur3/reps/1000000,"ms")
from tensor import Tensor, TensorShape, TensorSpec
from math import trunc, mod
fn tensorprint[type: DType](t: Tensor[type])->None:
    # Temporary pretty-printer for rank-1/2/3 tensors (until Tensor gets
    # native print support). Each value is rendered as sign, truncated
    # integer part, and up to 4 fractional digits.
    let rank = t.rank()
    var dim0:Int=0
    var dim1:Int=0
    var dim2:Int=0
    if rank==0 or rank>3:
        print("Error: Tensor rank should be: 1,2, or 3. Tensor rank is ", rank)
        return
    # Normalize all supported ranks onto a dim0 x dim1 x dim2 triple loop,
    # padding the leading dimensions with 1 for rank 1 and rank 2.
    if rank==1:
        dim0 = 1
        dim1 = 1
        dim2 = t.dim(0)
    if rank==2:
        dim0 = 1
        dim1 = t.dim(0)
        dim2 = t.dim(1)
    if rank==3:
        dim0 = t.dim(0)
        dim1 = t.dim(1)
        dim2 = t.dim(2)
    var val:SIMD[type, 1]=0.0
    for i in range(dim0):
        # Rank-3 tensors get an extra outer "[" bracket; other ranks just
        # separate outer slices with a blank line.
        if i==0 and rank==3:
            print("[")
        else:
            if i>0:
                print()
        for j in range(dim1):
            if rank!=1:
                if j==0:
                    print_no_newline("  [")
                else:
                    print_no_newline("\n   ")
            print_no_newline("[")
            for k in range(dim2):
                # Select the element with the indexing form matching the rank.
                if rank==1:
                    val = t[k]
                if rank==2:
                    val = t[j,k]
                if rank==3:
                    val = t[i,j,k]
                let int_str: String
                # Split sign off manually so trunc/mod operate on a
                # non-negative value.
                if val > 0 or val == 0:
                    int_str = String(trunc(val).cast[DType.int32]())
                else:
                    val = -val
                    int_str = "-"+String(trunc(val).cast[DType.int32]())
                # mod(val,1) stringifies as "0.xxxx..."; slice [2:6] keeps
                # the first 4 fractional digits.
                let float_str = String(mod(val,1))
                let s = int_str+"."+float_str[2:6]
                if k==0:
                    print_no_newline(s)
                else:
                    print_no_newline("  ",s)
            print_no_newline("]")
        if rank>1:
            print_no_newline("]")
        print()
    if rank==3:
        print("]")
    print("Tensor shape:",t.shape().__str__(),", Tensor rank:",rank,",","DType:", type.__str__())