from __future__ import annotations
import abc
import collections
import functools
import hashlib
import pathlib
import warnings
import numpy as np
from ..common import h5py
from .mapped import MappedHDF5Dataset
[docs]
class EmptyDatasetWarning(UserWarning):
    """Warning category for input files that hold no actual data"""
[docs]
class BaseImageChunkCache(abc.ABC):
    """Abstract base class for chunk-wise cached access to an image stack

    The stack is addressed along its first axis. Images are fetched in
    chunks of `chunk_size` images via :meth:`_get_chunk_data` (which
    subclasses must implement) and at most `cache_size` chunks are kept
    in memory.
    """

    def __init__(self,
                 shape: tuple[int],
                 chunk_size: int = 1000,
                 cache_size: int = 2,
                 ):
        """
        Parameters
        ----------
        shape: tuple of int
            shape of the full image stack; the first entry is the
            number of images
        chunk_size: int
            number of images per chunk (limited to the number of
            images in the stack)
        cache_size: int
            maximum number of chunks held in memory
        """
        self.shape = shape
        self._dtype = None
        # A chunk can never be larger than the stack itself.
        chunk_size = min(shape[0], chunk_size)
        self._len = self.shape[0]
        #: FIFO cache for the chunks (the oldest entry is evicted first)
        self.cache = collections.OrderedDict()
        self.image_shape = self.shape[1:]
        self.chunk_shape = (chunk_size,) + self.shape[1:]
        self.chunk_size = chunk_size
        self.cache_size = cache_size
        # `or 1` avoids a division by zero for empty stacks, for which
        # `chunk_size` is zero.
        self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))

    def __getitem__(self, index):
        """Return a single image or an array of images

        Parameters
        ----------
        index: int, slice, list, or np.ndarray
            An integer returns one image. A slice, a list, or an array
            of integer indices returns a new array holding the
            selected images along the first axis.
        """
        if isinstance(index, slice):
            # `slice.indices` normalizes negative and missing bounds
            # as well as negative steps, exactly like sequence slicing.
            indices = range(*index.indices(len(self)))
        elif isinstance(index, (list, np.ndarray)):
            indices = index
        else:
            chunk_index, sub_index = self._get_chunk_index_for_index(index)
            return self.get_chunk(chunk_index)[sub_index]

        array_out = np.empty((len(indices),) + self.image_shape,
                             dtype=self.dtype)
        for ii, idx in enumerate(indices):
            array_out[ii] = self[idx]
        return array_out

    def __len__(self):
        """Return the number of images in the stack"""
        return self._len

    @property
    def dtype(self):
        """data type of the image data"""
        if self._dtype is None:
            # Determined lazily from the first image.
            self._dtype = self[0].dtype
        return self._dtype

    @abc.abstractmethod
    def _get_chunk_data(self, chunk_slice):
        """Implemented in subclass to obtain actual data"""

    def _get_chunk_index_for_index(self, index):
        """Translate an image index to `(chunk_index, sub_index)`

        Negative indices count from the end of the stack.

        Raises
        ------
        IndexError
            if `index` lies outside of `[-len(self), len(self))`
        """
        requested = index
        index = int(index)  # convert np.uint64 to int, so we get ints below
        if index < 0:
            index += self._len
        # Check both bounds after normalization: a negative index below
        # `-len` would otherwise map to a negative chunk index.
        if not 0 <= index < self._len:
            raise IndexError(
                f"Index {requested} out of bounds for HDF5ImageCache "
                f"of size {self._len}")
        chunk_index = index // self.chunk_size
        sub_index = index % self.chunk_size
        return chunk_index, sub_index

    def get_chunk(self, chunk_index):
        """Return one chunk of images"""
        if chunk_index not in self.cache:
            # The emptiness check keeps `popitem` from raising a
            # KeyError when `cache_size` is zero.
            if self.cache and len(self.cache) >= self.cache_size:
                # Remove the first (oldest) item
                self.cache.popitem(last=False)
            data = self._get_chunk_data(self.get_chunk_slice(chunk_index))
            self.cache[chunk_index] = data
        return self.cache[chunk_index]

    def get_chunk_size(self, chunk_index):
        """Return the number of images in this chunk

        Raises
        ------
        IndexError
            if the chunk would start beyond the end of the stack
        """
        if chunk_index < self.num_chunks - 1:
            return self.chunk_size
        else:
            # The last chunk holds whatever is left over.
            chunk_size = self._len - chunk_index*self.chunk_size
            if chunk_size < 0:
                raise IndexError(f"{self} only has {self.num_chunks} chunks!")
            return chunk_size

    def get_chunk_slice(self, chunk_index):
        """Return the slice corresponding to the chunk index"""
        ch_slice = slice(self.chunk_size * chunk_index,
                         self.chunk_size * (chunk_index + 1)
                         )
        return ch_slice

    def iter_chunks(self):
        """Iterate over the chunk indices

        Chunk index `0` is always yielded, even for an empty stack.
        """
        index = 0
        chunk = 0
        while True:
            yield chunk
            chunk += 1
            index += self.chunk_size
            if index >= self._len:
                break
[docs]
class HDF5ImageCache(BaseImageChunkCache):
    """Chunk cache for an image stack stored in an HDF5 dataset"""

    def __init__(self,
                 h5ds: h5py.Dataset | MappedHDF5Dataset,  # type: ignore
                 chunk_size: int = 1000,
                 cache_size: int = 2,
                 boolean: bool = False):
        """An HDF5 image cache

        Image stacks in deformability cytometry data files are
        commonly chunked in various ways. Fetching a single image
        can be slow, since a whole HDF5 chunk has to be read and
        decompressed before the image can be extracted from it. The
        `HDF5ImageCache` class keeps chunks read from the HDF5 file
        in memory, making single-image-access very fast.

        Parameters
        ----------
        h5ds: h5py.Dataset or MappedHDF5Dataset
            dataset holding the image stack
        chunk_size: int
            number of images per cached chunk
        cache_size: int
            maximum number of chunks held in memory
        boolean: bool
            whether chunk data are converted to boolean arrays

        Notes
        -----
        An :class:`EmptyDatasetWarning` is issued if the dataset has
        zero length.
        """
        super().__init__(shape=h5ds.shape,
                         chunk_size=chunk_size,
                         cache_size=cache_size)
        self.h5ds = h5ds
        self.boolean = boolean
        if not self._len:
            message = (f"Input image '{h5ds.name}' in "
                       f"file {h5ds.file.filename} has zero length")
            warnings.warn(message, EmptyDatasetWarning)

    def _get_chunk_data(self, chunk_slice):
        """Read the images of `chunk_slice` from the HDF5 dataset"""
        chunk = self.h5ds[chunk_slice]
        return np.asarray(chunk, dtype=bool) if self.boolean else chunk
[docs]
class ImageCorrCache(BaseImageChunkCache):
    """Chunk cache for the difference between image and background

    Shape, chunk size, and cache size are taken over from `image`.
    """

    def __init__(self,
                 image: HDF5ImageCache,
                 image_bg: HDF5ImageCache):
        """
        Parameters
        ----------
        image: HDF5ImageCache
            cache of the raw images
        image_bg: HDF5ImageCache
            cache of the background images
        """
        super().__init__(shape=image.shape,
                         chunk_size=image.chunk_size,
                         cache_size=image.cache_size)
        self.image = image
        self.image_bg = image_bg

    def _get_chunk_data(self, chunk_slice):
        """Return `image - image_bg` for the images in `chunk_slice`

        The chunk data are requested directly from the two underlying
        objects via their `_get_chunk_data` methods, i.e. without
        going through their chunk caches.
        """
        # The raw images are cast to int16 before the subtraction.
        foreground = np.asarray(self.image._get_chunk_data(chunk_slice),
                                dtype=np.int16)
        background = self.image_bg._get_chunk_data(chunk_slice)
        return foreground - background
[docs]
@functools.cache
def md5sum(path, blocksize=65536, count=0):
    """Compute (partial) MD5 sum of a file

    Parameters
    ----------
    path: str or pathlib.Path
        path to the file
    blocksize: int
        block size in bytes read from the file
        (set to `0` to hash the entire file)
    count: int
        number of blocks read from the file
        (set to `0` to hash the entire file)

    Returns
    -------
    hexdigest: str
        hexadecimal MD5 digest of the data read

    Notes
    -----
    Results are memoized with `functools.cache` for each combination
    of arguments, so a file that is modified on disk after the first
    call yields the previously computed digest within the same process.
    """
    path = pathlib.Path(path)
    hasher = hashlib.md5()
    if not blocksize:
        # Reading zero bytes would end the loop right away and return
        # the digest of empty input. Honor the documented contract and
        # hash the whole file in default-sized blocks instead.
        blocksize = 65536
        count = 0
    with path.open('rb') as fd:
        ii = 0
        while buf := fd.read(blocksize):
            hasher.update(buf)
            ii += 1
            if count and ii == count:
                break
    return hasher.hexdigest()