Matrix multiplication#

In this benchmark we will compare the performance of ironArray, NumPy, Zarr+Dask, HDF5+Dask and TileDB+Dask when calculating a matrix product. We will first set up the optimized arrays from the corresponding Matrix Multiplication tutorial, and then compute the matrix multiplication. It is important to stress that we are using defaults for every library, which are typically fine tuned for general performance; this is a pretty fair way to compare them without spending long time optimizing for every one.

We will first set up the arrays with the same chunk shape, and then compute the matrix multiplication.

[1]:
import iarray as ia
import numpy as np
import os
import tiledb
import zarr
import dask.array as da
import h5py
import hdf5plugin as h5plugin
[2]:
%load_ext memprofiler

ironArray#

[3]:
%%mprof_run
ia.set_config_defaults(dtype=np.float64)
nrows = 100_000 # number of rows in matrix am
ncols = 25000   # number of columns in first matrix
ncols2 = 1000   # number of columns in second matrix

shape = (nrows, ncols, ncols2)
amshape = (shape[0], shape[1])
bmshape = (shape[1], shape[2])
# Obtain optimal chunk and block shapes
mparams = ia.matmul_params(amshape, bmshape, itemsize=8)
amchunks, amblocks, bmchunks, bmblocks = mparams

filename = "../tutorials/arr-gemm.iarr"
if os.path.exists(filename):
    am = ia.open(filename)
else:
    # btune does not represent an advantage when the number of chunks is small
    ia.set_config_defaults(btune=False)
    am = ia.random.normal(amshape, 3, 2, chunks=amchunks, blocks=amblocks, urlpath=filename, fp_mantissa_bits=20)
print(am.info)

w = np.ones(bmshape)
bm = ia.numpy2iarray(w, chunks=bmchunks, blocks=bmblocks)
print(bm.info)
type   : IArray
shape  : (100000, 25000)
chunks : (4116, 4116)
blocks : (147, 147)
cratio : 2.85

type   : IArray
shape  : (25000, 1000)
chunks : (4116, 1029)
blocks : (147, 147)
cratio : 260.05

memprofiler: used 199.11 MiB RAM (peak of 230.76 MiB) in 0.0790 s, total RAM usage 418.90 MiB

Before proceeding, let’s evict the just created file from the OS page cache, so as to better assess an actual out of core evaluation from disk.

[4]:
!vmtouch -e $filename
           Files: 1
     Directories: 0
   Evicted Pages: 2035023 (7G)
         Elapsed: 0.29202 seconds
[5]:
%%mprof_run 1.iarray::matmul
iacm_opt = ia.matmul(am, bm)
print(iacm_opt.info)
type   : IArray
shape  : (100000, 1000)
chunks : (4116, 1029)
blocks : (147, 147)
cratio : 37.14

memprofiler: used 50.64 MiB RAM (peak of 98.52 MiB) in 11.0587 s, total RAM usage 469.71 MiB

Zarr#

[6]:
zfilename = "arr-gemm.zarr"
if not os.path.exists(zfilename):
    # Matrix zam goes to disk
    zam = zarr.empty(store=zfilename, shape=amshape)
    am.copyto(zam)
zam = zarr.open(zfilename)
print(zam.info)
dask_a = da.from_zarr(zam)

zbm = zarr.create(shape=bmshape)
bm.copyto(zbm)
print(zbm.info)
dask_b = da.from_zarr(zbm)
Type               : zarr.core.Array
Data type          : float64
Shape              : (100000, 25000)
Chunk shape        : (1563, 391)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : zarr.storage.DirectoryStore
No. bytes          : 20000000000 (18.6G)
No. bytes stored   : 8758424321 (8.2G)
Storage ratio      : 2.3
Chunks initialized : 4096/4096

Type               : zarr.core.Array
Data type          : float64
Shape              : (25000, 1000)
Chunk shape        : (1563, 125)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : builtins.dict
No. bytes          : 200000000 (190.7M)
No. bytes stored   : 805393 (786.5K)
Storage ratio      : 248.3
Chunks initialized : 128/128

[7]:
!vmtouch -e $zfilename
           Files: 4097
     Directories: 1
   Evicted Pages: 2140153 (8G)
         Elapsed: 0.009813 seconds
[8]:
%%mprof_run 2.zarr::matmul
res = da.matmul(dask_a, dask_b)
cshape = (zam.shape[0], zbm.shape[1])
c = zarr.create(shape=cshape)
da.to_zarr(res, c)
print(c.info)
Type               : zarr.core.Array
Data type          : float64
Shape              : (100000, 1000)
Chunk shape        : (3125, 63)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : builtins.dict
No. bytes          : 800000000 (762.9M)
No. bytes stored   : 41768801 (39.8M)
Storage ratio      : 19.2
Chunks initialized : 512/512

memprofiler: used 9104.75 MiB RAM (peak of 23066.00 MiB) in 155.5037 s, total RAM usage 9583.20 MiB
[9]:
!vmtouch -e $zfilename
del res
del dask_a
del zam
del dask_b
del zbm
del c
del w
           Files: 4097
     Directories: 1
   Evicted Pages: 2140153 (8G)
         Elapsed: 0.27038 seconds

HDF5#

[10]:
ah5_urlpath = "arr-gemm.hdf5"
if not os.path.exists(ah5_urlpath):
    with h5py.File(ah5_urlpath, "w") as f:
        ah5 = f.create_dataset("arr-gemm", am.shape, dtype=am.dtype, **h5plugin.Blosc())
        am.copyto(ah5)

h5_file = h5py.File(ah5_urlpath, "r")
a_h5_disk = h5_file['arr-gemm']
a_h5dask = da.from_array(a_h5_disk)

bh5_urlpath = "barr-gemm.hdf5"
if not os.path.exists(bh5_urlpath):
    with h5py.File(bh5_urlpath, "w") as f:
        bh5 = f.create_dataset("barr-gemm", bm.shape, dtype=bm.dtype, **h5plugin.Blosc())
        bm.copyto(bh5)

h5_bfile = h5py.File(bh5_urlpath, "r", driver='core', backing_store=False)
b_h5 = h5_bfile['barr-gemm']
b_h5dask = da.from_array(b_h5)
[11]:
!vmtouch -e $ah5_urlpath
           Files: 1
     Directories: 0
   Evicted Pages: 2196074 (8G)
         Elapsed: 1.1e-05 seconds
[12]:
%%mprof_run 3.hdf5::matmul
res_h5dask = da.matmul(a_h5dask, b_h5dask)

f = h5py.File("matmul.hdf5", "w", driver='core', backing_store=False)
h5_matmul = f.create_dataset(name="matmul.hdf5", shape=iacm_opt.shape, dtype=iacm_opt.dtype, **h5plugin.Blosc())
da.to_hdf5("matmul.hdf5", '/x', res_h5dask)
memprofiler: used 554.93 MiB RAM (peak of 5149.72 MiB) in 38.6218 s, total RAM usage 9836.55 MiB
[13]:
!vmtouch -e $ah5_urlpath
del res_h5dask
del a_h5dask
del b_h5dask
del b_h5
del a_h5_disk
del h5_matmul
           Files: 1
     Directories: 0
   Evicted Pages: 2196074 (8G)
         Elapsed: 0.29901 seconds

TileDB#

[14]:
adom = tiledb.Domain(
        tiledb.Dim(name="rows", domain=(0, amshape[0] - 1), dtype=np.int32, tile=amchunks[0] if amchunks[0]%amshape[0] == 0 else amchunks[0]%amshape[0]),
        tiledb.Dim(name="cols", domain=(0, amshape[1] - 1), dtype=np.int32, tile=amchunks[1] if amchunks[1]%amshape[1] == 0 else amchunks[1]%amshape[1]),
    )

# The array will be dense with a single attribute "a" so each (i,j) cell can store an integer.
filters = tiledb.FilterList([tiledb.ByteShuffleFilter(), tiledb.LZ4Filter(5)])
# The array will be dense with a single attribute "a" so each (i,j) cell can store an integer.
schema = tiledb.ArraySchema(
    domain=adom,  sparse=False, attrs=[tiledb.Attr(name="a", dtype=np.float64, filters=filters)]
)

# Create the (empty) array on disk.
array_name = "tiledb_a"

if not os.path.exists(array_name):
    tiledb.DenseArray.create(array_name, schema)
    with tiledb.DenseArray(array_name, mode="w") as A:
        A[:] = am.data

tilea = da.from_tiledb(array_name, attribute='a')

bdom = tiledb.Domain(
        tiledb.Dim(name="rows", domain=(0, bmshape[0] - 1), dtype=np.int32, tile=bmchunks[0]%bmshape[0]),
        tiledb.Dim(name="cols", domain=(0, bmshape[1] - 1), dtype=np.int32, tile=bmchunks[1]%bmshape[1]),
)

# The array will be dense with a single attribute "a" so each (i,j) cell can store an integer.
schemab = tiledb.ArraySchema(
    domain=bdom, sparse=False, attrs=[tiledb.Attr(name="b", dtype=np.float64, filters=filters)]
)

# Create the (empty) array in memory
array_nameb = "mem://tiledb_b"
tiledb.DenseArray.create(array_nameb, schemab)
with tiledb.DenseArray(array_nameb, mode="w") as B:
    B[:] = bm.data

tileb = da.from_tiledb(array_nameb, attribute='b')
[15]:
!vmtouch -e $array_name
           Files: 5
     Directories: 4
   Evicted Pages: 2212260 (8G)
         Elapsed: 7.9e-05 seconds
[16]:
%%mprof_run 4.tiledb::matmul
res_dask = da.matmul(tilea, tileb)
res_dask.to_tiledb("mem://res_tiledb")
/home/faltet2/miniconda3/lib/python3.9/site-packages/dask/array/routines.py:383: PerformanceWarning: Increasing number of chunks by factor of 25
  out = blockwise(
memprofiler: used 973.19 MiB RAM (peak of 4471.44 MiB) in 74.1563 s, total RAM usage 2311.17 MiB
[17]:
del res_dask
del tilea
del tileb
tiledb.remove(array_nameb)

NumPy#

[18]:
npa = am.data
npb = bm.data
[19]:
%%mprof_run 5.numpy::matmul
npcm = np.matmul(npa, npb)
memprofiler: used 886.07 MiB RAM (peak of 886.07 MiB) in 14.5768 s, total RAM usage 22414.04 MiB
[20]:
del npa
del npb
del npcm

Results#

[21]:
%mprof_plot .*::matmul -t "Matrix multiplication computation"