# Source code for pycat.utils.math_utils

"""
Math Utilities Module for PyCAT

This module contains utility functions for mathematical operations that are used in the PyCAT application.
The functions include outlier removal, R squared calculation, and Gaussian kernel generation. These functions
are used in various parts of the application for data processing and analysis. 

Author
------
    Christian Neureuter, GitHub: https://github.com/cneureuter

Date
----
    4-20-2024
"""

# Third party imports
import numpy as np


def remove_outliers_iqr(data):
    """
    Remove outliers from a dataset using the Interquartile Range (IQR) method.

    The IQR is the difference between the 75th and 25th percentiles of the data.
    Values lying more than 1.5 times the IQR below the 25th percentile or above
    the 75th percentile are treated as outliers and dropped.

    Parameters
    ----------
    data : array_like
        The dataset from which outliers will be removed. Any array-like input
        (numpy array, list, tuple) of any shape is accepted. Percentiles are
        computed over all elements, regardless of shape.

    Returns
    -------
    filtered_data : numpy.ndarray
        A one-dimensional array holding the values of `data` that fall inside
        the outlier bounds (bounds inclusive), in their original flattened
        order. Because values are selected with a boolean mask, the input shape
        is NOT preserved: multi-dimensional input comes back flattened. An empty
        input yields an empty one-dimensional array.

    Notes
    -----
    The IQR method is often preferred over z-score or standard deviation methods
    for outlier removal when the data may not follow a normal distribution,
    because quartiles are less sensitive to extreme values. The same 1.5 * IQR
    margin is applied on both sides, so the method implicitly assumes the
    distribution is approximately symmetric around the median.

    Examples
    --------
    >>> data = np.array([1, 2, 3, 4, 5, 6, 100])
    >>> filtered_data = remove_outliers_iqr(data)
    >>> print(filtered_data)
    [1 2 3 4 5 6]
    """
    # Coerce to ndarray so lists/tuples work with the element-wise comparisons
    # and boolean-mask indexing below.
    data = np.asarray(data)

    # Percentiles are undefined for an empty dataset; there is nothing to filter.
    if data.size == 0:
        return data.reshape(-1)

    # First and third quartiles (Q1 and Q3), computed over all elements
    q1, q3 = np.percentile(data, [25, 75])

    # Interquartile Range (IQR)
    iqr = q3 - q1

    # Lower and upper bounds for outlier detection
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Keep only the values inside the (inclusive) bounds; the mask flattens the result
    filtered_data = data[(data >= lower_bound) & (data <= upper_bound)]

    return filtered_data
def calculate_r_squared(actual, predicted):
    """
    Calculate the coefficient of determination (R squared) between actual and
    predicted values from a regression model.

    R squared quantifies the proportion of variance in the dependent variable
    that is explained by the model:

        R^2 = 1 - (residual sum of squares / total sum of squares)

    A value of 1 indicates a perfect fit; a value of 0 indicates the model does
    no better than always predicting the mean of `actual`.

    Parameters
    ----------
    actual : array_like
        The actual values observed; the dependent variable.
    predicted : array_like
        The values predicted by the regression model. Must be broadcastable
        against `actual`.

    Returns
    -------
    r_squared : float
        The R squared value. It is at most 1 and can be negative when the
        predictions fit worse than the mean of `actual`. When `actual` is
        constant (total sum of squares is zero) the ratio is undefined; in that
        case 1.0 is returned if the predictions are exact and 0.0 otherwise.
    """
    # Work in floating point: accepts lists and avoids wraparound when the inputs
    # are unsigned-integer arrays (the subtraction below would otherwise overflow).
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    # Residual sum of squares (difference between actual and predicted values)
    residual_sum_of_squares = np.sum((actual - predicted) ** 2)

    # Total sum of squares (variability of the actual values around their mean)
    total_sum_of_squares = np.sum((actual - np.mean(actual)) ** 2)

    # Constant `actual`: the ratio would be 0/0 or x/0. Return a finite value
    # instead (same convention as scikit-learn's r2_score with force_finite=True).
    if total_sum_of_squares == 0:
        return 1.0 if residual_sum_of_squares == 0 else 0.0

    # R squared is ONE MINUS the unexplained fraction of the variance
    r_squared = 1.0 - (residual_sum_of_squares / total_sum_of_squares)

    return float(r_squared)
def create_2d_gaussian_kernel(kernel_size, sigma=None):
    """
    Generate a normalized 2D Gaussian kernel.

    Such a kernel is commonly used as a Point Spread Function (PSF) in image
    processing, e.g. to simulate a Gaussian blur. The kernel is a square matrix
    of side `kernel_size` with the Gaussian peak at the geometric centre of the
    matrix, and its elements sum to 1 so that convolution does not change the
    overall image brightness.

    Parameters
    ----------
    kernel_size : int
        The size (height and width) of the square Gaussian matrix.
    sigma : float, optional
        The standard deviation of the Gaussian distribution. Must be non-zero.
        If omitted, it is derived from the kernel size as
        ``0.3 * ((kernel_size - 1) * 0.5 - 1) + 0.8``.

    Returns
    -------
    kernel : numpy.ndarray
        A 2D array of shape ``(kernel_size, kernel_size)`` whose values follow a
        Gaussian distribution centred in the matrix and normalized to sum to 1.

    Notes
    -----
    Sample coordinates are placed symmetrically around zero. For odd sizes the
    centre element sits exactly on the peak; for even sizes the peak falls
    between the four central elements (coordinates are half-integers), which
    keeps the kernel symmetric instead of shifting it by half a pixel.
    """
    # Default sigma when not provided (formula from the OpenCV documentation)
    if sigma is None:
        sigma = 0.3 * ((kernel_size - 1) * 0.5 - 1) + 0.8

    # 1D sample coordinates, symmetric about 0 for both odd and even sizes:
    # size 5 -> [-2, -1, 0, 1, 2]; size 4 -> [-1.5, -0.5, 0.5, 1.5]
    ax = np.arange(kernel_size) - (kernel_size - 1) / 2.0

    # 2D coordinate grids
    xx, yy = np.meshgrid(ax, ax)

    # Unnormalized 2D Gaussian: exp(-(x^2 + y^2) / (2 * sigma^2))
    kernel = np.exp(-0.5 * (np.square(xx) + np.square(yy)) / np.square(sigma))

    # Normalize so that the sum of all elements equals 1
    kernel = kernel / np.sum(kernel)

    return kernel