Source code for circstudio.io.mask

import pandas as pd
import numpy as np
import os
import warnings
import pyexcel as pxl
from pandas.tseries.frequencies import to_offset

from ..analysis.tools import *


[docs] class BaseLog: """ Read and store time intervals (start/stop times) from a "log file". In circadian/actigraphy analysis workflows, you often keep a separate log describing periods to exclude or flag, e.g.: - Naps or bedtimes recorded manually - Mask periods suspected of being spurious inactivity (e.g., non-wear) This class reads a table with three columns: - An identifier (e.g., "Mask", "Nap", "Event") used as the row index. - Start time (datetime) - Stop time (datetime) After reading, the log is stored as a pandas DataFrame with an additional column: Duration = Stop_time - Start_Time Supported file formats ---------------------- - CSV: .csv - Spreadsheet-like: .xlsx, .xls, .ods (via pyexcel) Parameters ---------- input_fname : str Path to the input log file (relative or absolute). log : pandas.DataFrame A DataFrame containing at least: - Start_time (datetime64) - Stop_time (datetime64) Attributes ---------- fname : str Absolute path to the log file used. log : pandas.DataFrame Cleaned log table with columns ``Start_time``, ``Stop_time``, and ``Duration``. """ def __init__(self, input_fname, log): # get absolute file path self.__fname = os.path.abspath(input_fname) # Add `duration` column log["Duration"] = log["Stop_time"] - log["Start_time"] # Inplace drop of NA log.dropna(inplace=True) # add dataframe self.__log = log
[docs] @classmethod def from_file(cls, input_fname, index_name, *args, **kwargs): """Read start/stop-times from log files. Generic function to read start and stop times from log files. Supports different file format (.ods, .xls(x), .csv). Parameters ---------- input_fname: str Path to the log file. index_name: str Name of the index. *args Variable length argument list passed to the subsequent reader function. **kwargs Arbitrary keyword arguments passed to the subsequent reader function. Returns ------- absname: str Absolute filepath of the input log file. log: pandas.DataFrame Dataframe containing the data found in the log file. """ # get absolute file path absname = os.path.abspath(input_fname) # get basename and split it into base and extension basename = os.path.basename(absname) _, ext = os.path.splitext(basename) if ext == ".csv": log = cls.__from_csv(absname, index_name, *args, **kwargs) elif (ext == ".xlsx") or (ext == ".xls") or (ext == ".ods"): log = cls.__from_excel(absname, index_name, *args, **kwargs) else: raise ValueError( ( "File format for the input file {}".format(basename) + "is not currently supported." + "Supported file format:\n" + ".csv (text),\n" + ".ods (OpenOffice spreadsheet),\n" + ".xls (Excel spreadsheet)." ) ) return absname, log
@classmethod def __from_csv(cls, input_fname, index_name, sep=",", dayfirst=False): """Read start/stop-times from .csv files. Specific function to read start and stop times from csv files. Parameters ---------- input_fname: str Path to the log file. index_name: str Name of the index. sep: str, optional Delimiter to use. Default is ','. dayfirst: bool, optional If set to True, use DD/MM/YYYY format dates. Default is False. Returns ------- log : a pandas.DataFrame A dataframe with the start and stop times (columns) """ # Read data from the csv file into a dataframe log = pd.read_csv( input_fname, sep=sep, dayfirst=dayfirst, header=0, index_col=[0], usecols=[0, 1, 2], names=[index_name, "Start_time", "Stop_time"], parse_dates=[1, 2], ) return log @classmethod def __from_excel(cls, input_fname, index_name): """Read start/stop-times from excel-like files. Specific function to read start and stop times from .ods/.xls(x) files. Parameters ---------- input_fname: str Path to the log file. index_name: str Name of the index. Returns ------- log : a pandas.DataFrame A dataframe with the start and stop times (columns) """ # Read data from the log file into a np array sst_narray = np.array(pxl.get_array(file_name=input_fname)) # Create a DF with columns: index_name, start_time, stop time log = pd.DataFrame( sst_narray[1:, 1:3], index=sst_narray[1:, 0], columns=["Start_time", "Stop_time"], # dtype='datetime64[ns]' ) log.index.name = index_name return log @property def fname(self): """The absolute filepath of the input log file.""" return self.__fname @property def log(self): """The dataframe containing the data found in the log file.""" return self.__log
[docs] def summary(self, colname): """Returns a dataframe of summary statistics.""" return self.__log[colname].describe()
[docs] class Mask: """ Mixin that adds masking + preprocessing for actigraphy/light time series. This class is designed to be *mixed into* a higher-level recording object that already provides: Required attributes on the host class ------------------------------------- - ``activity`` : pandas.Series or None Activity counts indexed by time (DatetimeIndex). - ``light`` : pandas.Series or None Light intensity (e.g., lux) indexed by time (DatetimeIndex). - ``frequency`` : pandas.Timedelta Sampling epoch (e.g., 30 seconds, 1 minute). - ``start_time`` : pandas.Timestamp Beginning of the analysis window. - ``period`` : pandas.Timedelta Duration of the analysis window. What a “mask” means ------------------- A mask is a time series aligned to the data index (same timestamps), typically: - 1 = keep the data point - 0 = exclude the data point (e.g., non-wear, sensor failure) Masking is commonly used to remove long stretches of zero activity that likely represent *device removal*, not true sleep/inactivity. Key features ------------ - Auto-create an inactivity mask: flag sustained “zero-activity” stretches. - Manually add mask intervals using start/stop times. - Apply preprocessing filters: - resampling to a new epoch - binarization (thresholding) - optional missing-data imputation - mask-based exclusion Parameters ---------- exclude_if_mask : bool If True, masked samples are excluded downstream (e.g., set to NaN or removed, depending on the processing function). mask_inactivity : bool If True, apply the inactivity mask when filtering. binarize : bool If True, binarize the signal using ``threshold``. threshold : float Threshold used when ``binarize=True``. inactivity_length : int or None Minimum number of consecutive zero-activity epochs to be considered “inactivity” (potential non-wear). If None, no inactivity mask is created automatically. mask : pandas.Series or None A precomputed mask series aligned to the data index (1=keep, 0=exclude). Attributes ---------- mask : pandas.Series Mask aligned to the current analysis window. If no mask exists but ``inactivity_length`` is set, it is created automatically on first access. inactivity_length : int or None Minimum consecutive zero epochs defining an inactivity segment. """ def __init__(self, exclude_if_mask, mask_inactivity, binarize, threshold, inactivity_length, mask): self.exclude_if_mask = exclude_if_mask self._inactivity_length = inactivity_length self._mask_inactivity = mask_inactivity self._original_activity = self.activity self._original_light = self.light if self.light is not None else None self.binarize = binarize self.threshold = threshold if binarize else None self.impute_nan = False self.imputation_method = 'mean' self._mask = mask def _filter_data(self, data, new_freq, binarize, impute_nan, threshold, exclude_if_mask, imputation_method): self.threshold = threshold self.binarize = binarize self.impute_nan = impute_nan self.imputation_method = imputation_method self.exclude_if_mask = exclude_if_mask return _data_processor( data=data, binarize=self.binarize, threshold=self.threshold, current_freq=self.frequency, new_freq=self.frequency if new_freq is None else new_freq, mask=self._mask, mask_inactivity=self._mask_inactivity, impute_nan=self.impute_nan, imputation_method=self.imputation_method, exclude_if_mask=self.exclude_if_mask, )
[docs] def apply_filters(self, new_freq=None, binarize=False, threshold=0, apply_mask=False, impute_nan=False, exclude_if_mask=False, imputation_method='mean'): """ Apply preprocessing to activity and/or light time series. This method optionally: - Resamples the data to a new epoch length (e.g., 30s to 1 min). - Binarizes using a threshold - Applies masking (exclude masked samples) - Impute missing values Parameters ---------- new_freq : str or pandas offset, optional New epoch length (e.g., "1min", "30s"). If None, keep current frequency. binarize : bool If True, binarize values using ``threshold``. threshold : float Threshold for binarization. apply_mask : bool If True, apply the current mask (and create inactivity mask if configured). impute_nan : bool If True, fill NaNs using ``imputation_method``. exclude_if_mask : bool If True, masked values are excluded (implementation depends on the processing backend). imputation_method : {"mean", "median", ...} Strategy for filling missing values. """ # Reset filters before applying new ones self.reset_filters() if apply_mask: if self._mask is None: print('No mask was found. Create a new mask.') else: self._mask_inactivity = True if self.activity is not None: self.activity = self._filter_data(self.activity, new_freq=new_freq, binarize=binarize, threshold=threshold, impute_nan=impute_nan, exclude_if_mask=False, imputation_method=imputation_method) if self.light is not None: self.light = self._filter_data(self.light, new_freq=new_freq, binarize=binarize, threshold=threshold, impute_nan=impute_nan, exclude_if_mask=False, imputation_method=imputation_method)
[docs] def reset_filters(self): self._mask_inactivity = False if self.activity is not None: self.activity = self._original_activity if self.light is not None: self.light = self._original_light
@property def mask(self): r"""Mask used to filter out inactive data.""" if self._mask is None: # Create a mask if it does not exist if self._inactivity_length is not None: # Create an inactivity mask with the specified length (and above) self.create_inactivity_mask(self._inactivity_length) return self._mask.loc[self.start_time : self.start_time + self.period] else: print("Inactivity length set to None. Could not create a mask.") else: return self._mask.loc[self.start_time : self.start_time + self.period] @mask.setter def mask(self, value): self._mask = value @property def inactivity_length(self): r"""Length of the inactivity mask.""" return self._inactivity_length @inactivity_length.setter def inactivity_length(self, value): self._inactivity_length = value # Discard current mask (will be recreated upon access if needed) self._mask = None # Set switch to False if None if value is None: self._mask_inactivity = False
[docs] def create_inactivity_mask(self, duration): """Create a mask for inactivity (count equal to zero) periods. This mask has the same length as its underlying data and can be used to obfuscate inactive periods where the actimeter has most likely been removed. Warning: use a sufficiently long duration in order not to mask sleep periods. A minimal duration corresponding to two hours seems reasonable. Parameters ---------- duration: int or str Minimal number of consecutive zeroes for an inactive period. Time offset strings (ex: '90min') can also be used. """ if isinstance(duration, int): nepochs = duration elif isinstance(duration, str): nepochs = int(pd.Timedelta(duration) / self.frequency) else: nepochs = None warnings.warn( "Inactivity length must be a int and time offset string (ex: " "'90min'). Could not create a mask.", UserWarning, ) # Store requested mask duration (and discard current mask) self.inactivity_length = nepochs # Create actual mask self.mask = _create_inactivity_mask(self.activity, nepochs, 1)
[docs] def add_mask_period(self, start, stop): """Add a period to the inactivity mask Parameters ---------- start: str Start time (YYYY-MM-DD HH:MM:SS) of the inactivity period. stop: str Stop time (YYYY-MM-DD HH:MM:SS) of the inactivity period. """ # Check if a mask has already been created # NB : if the inactivity_length is not None, accessing the mask will # trigger its creation. if self.inactivity_length is None: self.inactivity_length = -1 # self.mask = pd.Series( # np.ones(self.length()), # index=self.data.index # ) # Check if start and stop are within the index range if pd.Timestamp(start) < self.mask.index[0]: raise ValueError( ( "Attempting to set the start time of a mask period before " + "the actual start time of the data.\n" + "Mask start time: {}".format(start) + "Data start time: {}".format(self.mask.index[0]) ) ) if pd.Timestamp(stop) > self.mask.index[-1]: raise ValueError( ( "Attempting to set the stop time of a mask period after " + "the actual stop time of the data.\n" + "Mask stop time: {}".format(stop) + "Data stop time: {}".format(self.mask.index[-1]) ) ) # Set mask values between start and stop to zeros # self.mask.loc[start:stop] = 0 self.mask = self.mask.mask( (self.mask.index >= start) & (self.mask.index <= stop), 0 )
[docs] def add_mask_periods(self, input_fname, *args, **kwargs): """Add periods to the inactivity mask Function to read start and stop times from a Mask log file. Supports different file format (.ods, .xls(x), .csv). Parameters ---------- input_fname: str Path to the log file. *args Variable length argument list passed to the subsequent reader function. **kwargs Arbitrary keyword arguments passed to the subsequent reader function. """ # Convert the log file into a DataFrame absname, log = BaseLog.from_file(input_fname, "Mask", *args, **kwargs) # Iterate over the rows of the DataFrame for _, row in log.iterrows(): self.add_mask_period(row["Start_time"], row["Stop_time"])