Source code for dcnum.read.cache

from __future__ import annotations

import abc
import collections
import functools
import hashlib
import pathlib
import warnings

import numpy as np

from ..common import h5py

from .mapped import MappedHDF5Dataset


class EmptyDatasetWarning(UserWarning):
    """Warning category for files that contain no actual data"""
class BaseImageChunkCache(abc.ABC):
    """Abstract base class for chunk-wise, cached access to an image stack

    The stack is addressed along its first axis. Data are fetched in
    chunks of `chunk_size` images via :meth:`_get_chunk_data` (which
    subclasses must implement) and at most `cache_size` chunks are kept
    in memory at any time.
    """

    def __init__(self,
                 shape: tuple[int],
                 chunk_size: int = 1000,
                 cache_size: int = 2,
                 ):
        """
        Parameters
        ----------
        shape: tuple of int
            shape of the full image stack; the first entry is the
            number of images
        chunk_size: int
            maximum number of images per chunk (clipped to the
            number of images in the stack)
        cache_size: int
            maximum number of chunks held in memory
        """
        self.shape = shape
        self._dtype = None
        # A chunk can never be larger than the stack itself.
        chunk_size = min(shape[0], chunk_size)
        self._len = self.shape[0]
        #: FIFO cache for the chunks: once `cache_size` chunks are held,
        #: the chunk that was inserted first is evicted first
        self.cache = collections.OrderedDict()
        self.image_shape = self.shape[1:]
        self.chunk_shape = (chunk_size,) + self.shape[1:]
        self.chunk_size = chunk_size
        self.cache_size = cache_size
        # `or 1` avoids a division by zero for empty stacks
        self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))

    def __getitem__(self, index):
        """Return a single image or a stack of images

        Parameters
        ----------
        index: int, slice, list, or np.ndarray
            A scalar index returns one image. A slice, a list, or an
            array of indices returns a new array with the selected
            images stacked along the first axis.

        Raises
        ------
        IndexError
            if a scalar index is out of bounds
        """
        if isinstance(index, slice):
            # `slice.indices` resolves missing and negative start/stop/step
            # values and clips them to the length, like a regular sequence.
            indices = range(*index.indices(len(self)))
        elif isinstance(index, (list, np.ndarray)):
            indices = index
        else:
            chunk_index, sub_index = self._get_chunk_index_for_index(index)
            return self.get_chunk(chunk_index)[sub_index]

        array_out = np.empty((len(indices),) + self.image_shape,
                             dtype=self.dtype)
        for ii, idx in enumerate(indices):
            array_out[ii] = self[idx]
        return array_out

    def __len__(self):
        """Return the number of images in the stack"""
        return self._len

    @property
    def dtype(self):
        """data type of the image data"""
        if self._dtype is None:
            # Determined lazily from the first image and then remembered.
            self._dtype = self[0].dtype
        return self._dtype

    @abc.abstractmethod
    def _get_chunk_data(self, chunk_slice):
        """Implemented in subclass to obtain actual data"""

    def _get_chunk_index_for_index(self, index):
        """Translate an image index into `(chunk_index, sub_index)`

        Negative indices count from the end of the stack.

        Raises
        ------
        IndexError
            if `index` lies outside of `[-len(self), len(self))`
        """
        position = index + self._len if index < 0 else index
        if not 0 <= position < self._len:
            raise IndexError(
                f"Index {index} out of bounds for {type(self).__name__} "
                f"of size {self._len}")
        # convert np.uint64 to int, so we get ints below
        position = int(position)
        chunk_index = position // self.chunk_size
        sub_index = position % self.chunk_size
        return chunk_index, sub_index

    def get_chunk(self, chunk_index):
        """Return one chunk of images"""
        if chunk_index not in self.cache:
            if len(self.cache) >= self.cache_size:
                # Make room by removing the chunk that was inserted first
                self.cache.popitem(last=False)
            data = self._get_chunk_data(self.get_chunk_slice(chunk_index))
            self.cache[chunk_index] = data
        return self.cache[chunk_index]

    def get_chunk_size(self, chunk_index):
        """Return the number of images in this chunk

        Every chunk except the last one holds `chunk_size` images; the
        last chunk holds whatever remains.

        Raises
        ------
        IndexError
            if the chunk would start beyond the end of the stack
        """
        if chunk_index < self.num_chunks - 1:
            return self.chunk_size
        else:
            chunk_size = self._len - chunk_index*self.chunk_size
            if chunk_size < 0:
                raise IndexError(f"{self} only has {self.num_chunks} chunks!")
            return chunk_size

    def get_chunk_slice(self, chunk_index):
        """Return the slice corresponding to the chunk index"""
        ch_slice = slice(self.chunk_size * chunk_index,
                         self.chunk_size * (chunk_index + 1)
                         )
        return ch_slice

    def iter_chunks(self):
        """Iterate over all chunk indices, starting at zero

        Chunk index `0` is always yielded, even for an empty stack.
        """
        index = 0
        chunk = 0
        while True:
            yield chunk
            chunk += 1
            index += self.chunk_size
            if index >= self._len:
                break
class HDF5ImageCache(BaseImageChunkCache):
    def __init__(self,
                 h5ds: h5py.Dataset | MappedHDF5Dataset,  # type: ignore
                 chunk_size: int = 1000,
                 cache_size: int = 2,
                 boolean: bool = False):
        """Chunk cache for an image stack stored in an HDF5 dataset

        Image stacks in deformability cytometry data files are commonly
        chunked in various ways. Reading one image then means loading
        and decompressing a complete HDF5 chunk and extracting the image
        from it, which is slow. This class keeps the chunks read from
        the HDF5 file in memory, so that access to single images
        becomes very fast.

        Parameters
        ----------
        h5ds: h5py.Dataset or MappedHDF5Dataset
            dataset holding the image stack
        chunk_size: int
            maximum number of images per cached chunk
        cache_size: int
            maximum number of chunks held in memory
        boolean: bool
            if True, chunk data are converted to a boolean array

        Warns
        -----
        EmptyDatasetWarning
            if the dataset has zero length
        """
        super().__init__(shape=h5ds.shape,
                         chunk_size=chunk_size,
                         cache_size=cache_size)
        self.h5ds = h5ds
        self.boolean = boolean
        if self._len == 0:
            message = (f"Input image '{h5ds.name}' in "
                       f"file {h5ds.file.filename} has zero length")
            warnings.warn(message, EmptyDatasetWarning)

    def _get_chunk_data(self, chunk_slice):
        """Read `chunk_slice` from the dataset, optionally as booleans"""
        chunk = self.h5ds[chunk_slice]
        return np.asarray(chunk, dtype=bool) if self.boolean else chunk
class ImageCorrCache(BaseImageChunkCache):
    """Chunk cache for the difference of an image and a background stack"""

    def __init__(self,
                 image: HDF5ImageCache,
                 image_bg: HDF5ImageCache):
        """
        Parameters
        ----------
        image: HDF5ImageCache
            image stack; its shape, chunk size, and cache size are
            adopted for this cache
        image_bg: HDF5ImageCache
            stack that is subtracted from `image`
        """
        super().__init__(shape=image.shape,
                         chunk_size=image.chunk_size,
                         cache_size=image.cache_size)
        self.image = image
        self.image_bg = image_bg

    def _get_chunk_data(self, chunk_slice):
        """Return the image chunk, cast to int16, minus the background"""
        # The chunk data are fetched directly from both sources, i.e.
        # without going through their own chunk caches.
        raw = np.asarray(self.image._get_chunk_data(chunk_slice),
                         dtype=np.int16)
        background = self.image_bg._get_chunk_data(chunk_slice)
        return raw - background
@functools.cache
def md5sum(path, blocksize=65536, count=0):
    """Compute (partial) MD5 sum of a file

    The result is memoized with `functools.cache`: calling this
    function again with the same arguments returns the stored value
    without reading the file again, even if the file has changed in
    the meantime.

    Parameters
    ----------
    path: str or pathlib.Path
        path to the file
    blocksize: int
        block size in bytes read from the file
        (set to `0` to hash the entire file)
    count: int
        number of blocks read from the file
        (set to `0` to read blocks until the end of the file)

    Returns
    -------
    hexdigest: str
        hexadecimal MD5 digest of the data read
    """
    path = pathlib.Path(path)
    if not blocksize or blocksize < 0:
        # Hash the entire file as documented. Reading zero bytes would
        # otherwise end the loop below before any data are hashed. The
        # file is still streamed in blocks to keep memory usage low.
        blocksize = 65536
        count = 0
    hasher = hashlib.md5()
    with path.open('rb') as fd:
        blocks_read = 0
        while buf := fd.read(blocksize):
            hasher.update(buf)
            blocks_read += 1
            if count and blocks_read == count:
                break
    return hasher.hexdigest()