# Source code for pycat.utils.math_utils
"""
Math Utilities Module for PyCAT
This module contains utility functions for mathematical operations that are used in the PyCAT application.
The functions include outlier removal, R squared calculation, and Gaussian kernel generation. These functions
are used in various parts of the application for data processing and analysis.
Author
------
Christian Neureuter, GitHub: https://github.com/cneureuter
Date
----
4-20-2024
"""
# Third party imports
import numpy as np
[docs]
def remove_outliers_iqr(data):
    """
    Remove outliers from a dataset using the Interquartile Range (IQR) method.

    The IQR is the difference between the 75th and 25th percentiles of the data.
    Values lying more than 1.5 times the IQR below the 25th percentile or above
    the 75th percentile are treated as outliers and dropped.

    Parameters
    ----------
    data : array_like
        The dataset to filter. Any array-like accepted by ``numpy.asarray``
        (ndarray, list, tuple, ...) and any shape is allowed; percentiles are
        computed over all elements.

    Returns
    -------
    filtered_data : numpy.ndarray
        A 1-D array holding the values that lie inside the IQR fences, in their
        original (row-major) order. Because the selection is done with a boolean
        mask, multi-dimensional input is returned flattened. Empty input yields
        an empty 1-D array.

    Notes
    -----
    The IQR method is often preferred over z-score or standard deviation methods
    when the data may not follow a normal distribution, because quartiles are
    less sensitive to extreme values than the mean and standard deviation.

    The fences are inclusive: values exactly equal to a bound are kept.

    NaN values are not handled specially; they are passed to ``np.percentile``
    as-is.

    Examples
    --------
    >>> data = np.array([1, 2, 3, 4, 5, 6, 100])
    >>> filtered_data = remove_outliers_iqr(data)
    >>> print(filtered_data)
    [1 2 3 4 5 6]
    """
    # Coerce once so lists/tuples support the boolean-mask indexing used below.
    values = np.asarray(data)

    # Percentiles are undefined for an empty dataset; nothing to filter.
    if values.size == 0:
        return values.flatten()

    # First and third quartiles (Q1 and Q3) in a single pass.
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1

    # Fences for outlier detection.
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Keep only the values inside the (inclusive) fences.
    inlier_mask = (values >= lower_bound) & (values <= upper_bound)
    return values[inlier_mask]
[docs]
def calculate_r_squared(actual, predicted):
    """
    Calculate the coefficient of determination (R squared) between actual and
    predicted values from a regression model.

    R squared is computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum
    of squared residuals and ``SS_tot`` is the sum of squared deviations of the
    actual values from their mean. A value of 1 indicates a perfect fit and a
    value of 0 indicates that the model does no better than always predicting
    the mean of the actual values.

    Parameters
    ----------
    actual : array_like
        The observed values of the dependent variable.
    predicted : array_like
        The values predicted by the regression model. Must be broadcastable
        against `actual`.

    Returns
    -------
    r_squared : float
        The R squared value. It is at most 1; it can be negative when the
        predictions fit worse than the mean of `actual`.

    Notes
    -----
    If every value in `actual` is identical the total sum of squares is zero and
    the result follows NumPy's division-by-zero semantics (``nan`` or ``-inf``).
    """
    # Work in floating point so integer or list inputs behave consistently.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    # Residual sum of squares: unexplained variation.
    residual_sum_of_squares = np.sum((actual - predicted) ** 2)

    # Total sum of squares: variation of the observations around their mean.
    total_sum_of_squares = np.sum((actual - np.mean(actual)) ** 2)

    # R^2 = 1 - SS_res / SS_tot
    r_squared = 1 - (residual_sum_of_squares / total_sum_of_squares)
    return r_squared
[docs]
def create_2d_gaussian_kernel(kernel_size, sigma=None):
"""
Generate a 2D Gaussian kernel, which is commonly used as a Point Spread Function (PSF) in image processing
applications, particularly for simulating a Gaussian blur effect.
The kernel is a square matrix with dimensions defined by `kernel_size`, centered on the Gaussian peak.
The sum of all elements in the kernel is normalized to 1, ensuring no change in the overall image brightness
after convolution.
Parameters
----------
kernel_size : int
The size (height and width) of the square Gaussian matrix.
sigma : float, optional
The standard deviation of the Gaussian distribution. Defaults to a calculation based on the kernel size
that approximates the behavior of a Gaussian blur in image processing contexts.
Returns
-------
kernel : numpy.array
A 2D numpy array representing the Gaussian kernel. The kernel values follow a Gaussian distribution,
centered in the matrix, and normalized such that the sum equals 1.
Notes
-----
A default sigma is calculated if not provided, using a formula that balances between spread and central peak
intensity based on the kernel size.
"""
# Calculate the default sigma value if not provided
if sigma is None:
sigma = 0.3*((kernel_size-1)*0.5 - 1) + 0.8 # Calculated as per OpenCV documentation
# Create a 1D kernel
ax = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
# Create a 2D kernel by meshgrid
xx, yy = np.meshgrid(ax, ax)
# Create the Gaussian kernel using the 2D Gaussian formula
kernel = np.exp(-0.5 * (np.square(xx) + np.square(yy)) / np.square(sigma))
# Normalize the kernel to ensure the sum of all elements equals 1
kernel = kernel / np.sum(kernel)
return kernel