# Source code for pycat.data.data_modules

"""
Data Module for PyCAT

This module contains classes and functions for managing and processing data within a biological
image analysis context, using napari. The primary components include BaseDataClass, which provides
basic functionalities for data handling and management, and AnalysisDataClass, which extends
BaseDataClass to cater specifically to puncta and cell data analysis.

This module is designed to be integrated with napari viewers to facilitate real-time data manipulation
and analysis, enhancing the workflow in biological research settings.

Author
------
    Christian Neureuter, GitHub: https://github.com/cneureuter

Date
----
    4-20-2024
"""

# Standard library imports
import math
import copy

# Third party imports
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from napari.utils.notifications import show_info as napari_show_info
from napari.utils.notifications import show_warning as napari_show_warning



class BaseDataClass:
    """
    A base class for managing data related to image analysis or similar scientific
    data processing applications. It encapsulates operations for storing, retrieving,
    updating, and resetting data, especially focusing on handling pandas DataFrames
    for analysis results alongside other types of metadata and parameters.

    Attributes
    ----------
    data_repository : dict
        A dictionary acting as a central repository for all data managed by
        instances of this class. This includes pandas DataFrames for storing
        analysis results, numeric parameters for analysis, and metadata.

    Methods
    -------
    set_data(self, key, data):
        Stores or updates data in the data repository under the specified key.
    get_data(self, key, default_value=None):
        Retrieves data stored in the data repository under the specified key.
    append_to_df(self, key, data):
        Appends a new row of data to a DataFrame stored under the specified key.
    update_df(self, key, index, column, value):
        Updates a specific value in a DataFrame stored under the specified key.
    add_column_to_df(self, key, column_name, default_value=None):
        Adds a new column to a DataFrame stored under the specified key,
        initializing all rows in this column to a default value.
    get_dataframes(self):
        Retrieves all pandas DataFrames stored in the data repository.
    get_all_variables(self):
        Returns a list of all keys currently stored in the data repository.
    reset_values(self, df_names_to_reset=None, clear_all=False):
        Resets specified entries or all data within the class to their default
        initialization values.
    update_metadata(self, image):
        Updates the pixel-size parameter and metadata from an image object.
    calculate_length(self, viewer):
        Calculates and updates the size parameters for objects and cells based
        on annotations made in the napari viewer.
    """

    def __init__(self, base_data_repository=None):
        """
        Initializes the data repository.

        Parameters
        ----------
        base_data_repository : dict, optional
            An existing repository to start from. When provided (and not empty),
            a deep copy of it becomes this instance's repository, so later changes
            to the caller's dictionary do not affect this instance. When omitted,
            `None`, or empty, the repository is filled with the default values
            from `_initialize_repository`.
        """
        if base_data_repository:
            self.data_repository = copy.deepcopy(base_data_repository)
        else:
            self._initialize_repository()

    def _initialize_repository(self):
        """
        Initialize or reset the data repository with default values.

        Includes all dataframes and parameters used for analysis.
        """
        self.data_repository = {
            # Original BaseDataClass attributes
            'region_props_df': pd.DataFrame(),
            'generic_df': pd.DataFrame(),
            'object_size': 50,
            'cell_diameter': 100,
            'ball_radius': 75,
            'microns_per_pixel_sq': 1,
            'metadata': {},
            # Former AnalysisDataClass attributes
            'cell_df': pd.DataFrame(),
            'puncta_df': pd.DataFrame(),
        }

    def _require_df(self, key):
        """
        Return the DataFrame stored under `key`, or warn and return `None` when
        the key is missing or does not hold a DataFrame.
        """
        candidate = self.data_repository.get(key)
        if isinstance(candidate, pd.DataFrame):
            return candidate
        napari_show_warning(f"Key {key} does not exist or is not a DataFrame")
        return None

    def set_data(self, key, data):
        """
        Stores or updates the specified data under the given key within the
        data repository.

        Parameters
        ----------
        key : str
            A unique identifier for the data being stored.
        data : object
            The data to be stored in the data repository under the specified key.

        Notes
        -----
        - A key that is not yet in the repository is created and `data` is stored
          as given (no copy is made).
        - For an existing key, the class of `data` must be exactly the class of
          the value currently stored; otherwise a warning is shown and the stored
          value is left unchanged. On a match a deep copy of `data` is stored.
        - An existing key whose stored value is `None` (for example after
          `reset_values` cleared it) accepts data of any type, so a cleared entry
          can be populated again.
        """
        # Membership has to be tested before the stored value is inspected;
        # indexing a missing key would raise KeyError.
        if key not in self.data_repository:
            self.data_repository[key] = data
            return

        stored = self.data_repository[key]
        if stored is not None and stored.__class__ != data.__class__:
            napari_show_warning(f"Data type mismatch for key {key}.")
            return

        self.data_repository[key] = copy.deepcopy(data)

    def get_data(self, key, default_value=None):
        """
        Retrieves the data stored under the specified key from the data repository.

        Parameters
        ----------
        key : str
            The unique identifier for the data to be retrieved.
        default_value : object, optional
            The value to return if the key does not exist in the data repository.
            The default is `None`.

        Returns
        -------
        object
            The data stored under the specified key, or `default_value` if the
            key does not exist.
        """
        return self.data_repository.get(key, default_value)

    def append_to_df(self, key, data: dict):
        """
        Appends a new row of data to the DataFrame identified by the given key.

        A warning is shown and nothing is changed when the key is missing or
        does not hold a DataFrame.

        Parameters
        ----------
        key : str
            The key identifying the DataFrame to which the new row should be appended.
        data : dict
            A dictionary where keys represent column names and values represent
            the data to be appended.
        """
        df = self._require_df(key)
        if df is None:
            return
        new_row = pd.DataFrame([data])
        # The concatenated frame replaces the stored one; the row index is renumbered.
        self.data_repository[key] = pd.concat([df, new_row], ignore_index=True)

    def update_df(self, key, index, column, value):
        """
        Updates the value at a specified index and column in the DataFrame
        identified by the given key.

        A warning is shown and nothing is changed when the key is missing or
        does not hold a DataFrame.

        Parameters
        ----------
        key : str
            The key identifying the DataFrame to be updated.
        index : int
            The index of the row to be updated.
        column : str
            The column name where the value should be updated.
        value : object
            The new value to be set at the specified index and column.
        """
        df = self._require_df(key)
        if df is not None:
            df.at[index, column] = value

    def add_column_to_df(self, key, column_name, default_value=None):
        """
        Adds a new column to the DataFrame identified by the given key,
        initializing it with the specified default value for all rows.

        A warning is shown and nothing is changed when the key is missing or
        does not hold a DataFrame.

        Parameters
        ----------
        key : str
            The key identifying the DataFrame to which the new column should be added.
        column_name : str
            The name of the new column to be added.
        default_value : object, optional
            The value assigned to all rows in the new column. Defaults to `None`.
        """
        df = self._require_df(key)
        if df is not None:
            df[column_name] = default_value

    # NOTE: observer notifications (a `_notify` hook forwarding messages to a data
    # manager) are a possible future addition; nothing is notified at present.

    def get_dataframes(self):
        """
        Retrieves and returns a dictionary of all pandas DataFrames currently
        stored in the data repository, keyed by their repository key.
        """
        return {
            name: value
            for name, value in self.data_repository.items()
            if isinstance(value, pd.DataFrame)
        }

    def get_all_variables(self):
        """
        Returns a list of all keys representing the data currently stored in
        the data repository.
        """
        return list(self.data_repository)

    def reset_values(self, df_names_to_reset=None, clear_all=False):
        """
        Resets specific entries or clears all data within the class back to
        default initialization values.

        Parameters
        ----------
        df_names_to_reset : list of str, optional
            Keys identifying which entries should be reset. An entry holding a
            DataFrame is replaced by an empty DataFrame; any other existing entry
            is set to `None`; for a key that does not exist a warning is shown.
            This parameter is ignored if `clear_all` is True.
        clear_all : bool, optional
            If True, the entire data repository is reinitialized to the default
            values from `_initialize_repository`, overriding `df_names_to_reset`.
        """
        if clear_all:
            self._initialize_repository()
            return

        for df_name in df_names_to_reset or ():
            if df_name not in self.data_repository:
                # Unknown keys are reported with a warning, not an exception.
                napari_show_warning(f"Key '{df_name}' does not exist in the data repository.")
            elif isinstance(self.data_repository[df_name], pd.DataFrame):
                self.data_repository[df_name] = pd.DataFrame()
            else:
                self.data_repository[df_name] = None

    def update_metadata(self, image):
        """
        Update the pixel-size parameter and metadata while preserving other state.

        Writes `microns_per_pixel_sq` (product of the X and Y physical pixel sizes)
        and `metadata` into the data repository. Whenever a value is unavailable
        or an exception occurs, a warning is shown and the defaults (1 and an
        empty dictionary) are stored instead.

        Parameters
        ----------
        image : AICSImage
            Image object containing metadata to extract
        """
        try:
            if len(image.scenes) > 1:
                # With several scenes, only the first one is used.
                # NOTE(review): confirm the image type really provides `get_scene`;
                # if it does not, the AttributeError handler below stores the
                # defaults for every multi-scene image.
                source = image.get_scene(0)
            else:
                source = image
            physical_pixel_sizes = source.physical_pixel_sizes
            metadata = source.metadata

            x_resolution = getattr(physical_pixel_sizes, 'X', None)
            y_resolution = getattr(physical_pixel_sizes, 'Y', None)

            if x_resolution is None or y_resolution is None:
                napari_show_warning(
                    "Resolution data incomplete, using default value of 1 (um/px)^2."
                )
                self.data_repository['microns_per_pixel_sq'] = 1
            else:
                self.data_repository['microns_per_pixel_sq'] = x_resolution * y_resolution

            if metadata:
                self.data_repository['metadata'] = metadata
            else:
                napari_show_warning(
                    "Metadata is empty or unavailable, using empty dictionary."
                )
                self.data_repository['metadata'] = {}

        except AttributeError as e:
            napari_show_warning(
                f"Attribute error encountered: {e}. Using default values."
            )
            self.data_repository['microns_per_pixel_sq'] = 1
            self.data_repository['metadata'] = {}
        except Exception as e:
            # Best-effort boundary: any other failure also falls back to defaults.
            napari_show_warning(
                f"Unexpected error while updating metadata: {e}. Using default values."
            )
            self.data_repository['microns_per_pixel_sq'] = 1
            self.data_repository['metadata'] = {}

    @staticmethod
    def _last_shape_coords(viewer, layer_name):
        """
        Return the last entry of the named layer's `data`, or `None` when the
        layer is absent from the viewer or its `data` is empty.
        """
        if layer_name not in viewer.layers:
            return None
        shapes = viewer.layers[layer_name].data
        return shapes[-1] if shapes else None

    def calculate_length(self, viewer):
        """
        Utilizes annotations made in a napari viewer to calculate and update size
        parameters for objects and cells.

        Parameters
        ----------
        viewer : napari.Viewer
            A napari viewer instance containing annotations for calculating
            object and cell sizes.

        Notes
        -----
        The layers named 'Cell Diameter' and 'Object Diameter' are read. For each,
        the Euclidean distance between the first two points of the last shape
        drawn is stored under 'cell_diameter' and 'object_size' respectively;
        'ball_radius' is set to 1.5 times the object radius, rounded up. When a
        layer or shape is missing, a warning is shown and the value already in
        the repository is kept. Finally both diameters are reported in pixels
        and, scaled by the square root of 'microns_per_pixel_sq', in microns.
        """
        cell_coords = self._last_shape_coords(viewer, 'Cell Diameter')
        object_coords = self._last_shape_coords(viewer, 'Object Diameter')

        if cell_coords is not None:
            self.data_repository['cell_diameter'] = euclidean(cell_coords[0], cell_coords[1])
        else:
            napari_show_warning("No cell diameter found, using default value. Please draw a line to measure the cell diameter.")

        if object_coords is not None:
            self.data_repository['object_size'] = euclidean(object_coords[0], object_coords[1])
            object_radius = self.data_repository['object_size'] / 2
            self.data_repository['ball_radius'] = math.ceil(1.5*object_radius)
        else:
            napari_show_warning("No object diameter found, using default value. Please draw a line to measure the object diameter.")

        # The repository stores the squared scale, so take the root for lengths.
        microns_per_pixel = np.sqrt(self.data_repository['microns_per_pixel_sq'])

        cell_px = self.data_repository['cell_diameter']
        object_px = self.data_repository['object_size']
        napari_show_info(
            "The diameter of the cell is: "
            f"{round(cell_px*microns_per_pixel, 2)}um ({round(cell_px, 2)}px),"
            "the diameter of the object is: "
            f"{round(object_px*microns_per_pixel, 2)}um ({round(object_px, 2)}px)"
        )
#napari_show_info("This is a message for Napari users!")