Source code for circstudio.io.mask

import pandas as pd
import numpy as np
import os
import warnings
import pyexcel as pxl
from pandas.tseries.frequencies import to_offset

from ..analysis.tools import *



[docs]
class BaseLog:
    """
    Read and store time intervals (start/stop times) from a "log file".

    In circadian/actigraphy analysis workflows, you often keep a separate log
    describing periods to exclude or flag, e.g.:
    - Naps or bedtimes recorded manually
    - Mask periods suspected of being spurious inactivity (e.g., non-wear)

    This class reads a table with three columns:
    - An identifier (e.g., "Mask", "Nap", "Event") used as the row index.
    - Start time (datetime)
    - Stop time (datetime)

    After reading, the log is stored as a pandas DataFrame with an additional
    column: Duration = Stop_time - Start_Time

    Supported file formats
    ----------------------
    - CSV: .csv
    - Spreadsheet-like: .xlsx, .xls, .ods (via pyexcel)

    Parameters
    ----------
    input_fname : str
        Path to the input log file (relative or absolute).
    log : pandas.DataFrame
        A DataFrame containing at least:
        - Start_time (datetime64)
        - Stop_time (datetime64)

    Attributes
    ----------
    fname : str
        Absolute path to the log file used.
    log : pandas.DataFrame
        Cleaned log table with columns ``Start_time``, ``Stop_time``, and ``Duration``.
    """

    def __init__(self, input_fname, log):
        # get absolute file path
        self.__fname = os.path.abspath(input_fname)

        # Add `duration` column
        log["Duration"] = log["Stop_time"] - log["Start_time"]

        # Inplace drop of NA
        log.dropna(inplace=True)

        # add dataframe
        self.__log = log


[docs]
    @classmethod
    def from_file(cls, input_fname, index_name, *args, **kwargs):
        """Read start/stop-times from log files.

        Generic function to read start and stop times from log files. Supports
        different file format (.ods, .xls(x), .csv).

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        index_name: str
            Name of the index.
        *args
            Variable length argument list passed to the subsequent reader
            function.
        **kwargs
            Arbitrary keyword arguments passed to the subsequent reader
            function.

        Returns
        -------
        absname: str
            Absolute filepath of the input log file.
        log: pandas.DataFrame
            Dataframe containing the data found in the log file.

        """

        # get absolute file path
        absname = os.path.abspath(input_fname)

        # get basename and split it into base and extension
        basename = os.path.basename(absname)
        _, ext = os.path.splitext(basename)

        if ext == ".csv":
            log = cls.__from_csv(absname, index_name, *args, **kwargs)
        elif (ext == ".xlsx") or (ext == ".xls") or (ext == ".ods"):
            log = cls.__from_excel(absname, index_name, *args, **kwargs)
        else:
            raise ValueError(
                (
                    "File format for the input file {}".format(basename)
                    + "is not currently supported."
                    + "Supported file format:\n"
                    + ".csv (text),\n"
                    + ".ods (OpenOffice spreadsheet),\n"
                    + ".xls (Excel spreadsheet)."
                )
            )
        return absname, log


    @classmethod
    def __from_csv(cls, input_fname, index_name, sep=",", dayfirst=False):
        """Read start/stop-times from .csv files.

        Specific function to read start and stop times from csv files.

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        index_name: str
            Name of the index.
        sep: str, optional
            Delimiter to use.
            Default is ','.
        dayfirst: bool, optional
            If set to True, use DD/MM/YYYY format dates.
            Default is False.

        Returns
        -------
        log : a pandas.DataFrame
            A dataframe with the start and stop times (columns)

        """

        # Read data from the csv file into a dataframe
        log = pd.read_csv(
            input_fname,
            sep=sep,
            dayfirst=dayfirst,
            header=0,
            index_col=[0],
            usecols=[0, 1, 2],
            names=[index_name, "Start_time", "Stop_time"],
            parse_dates=[1, 2],
        )
        return log

    @classmethod
    def __from_excel(cls, input_fname, index_name):
        """Read start/stop-times from excel-like files.

        Specific function to read start and stop times from .ods/.xls(x) files.

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        index_name: str
            Name of the index.

        Returns
        -------
        log : a pandas.DataFrame
            A dataframe with the start and stop times (columns)

        """

        # Read data from the log file into a np array
        sst_narray = np.array(pxl.get_array(file_name=input_fname))

        # Create a DF with columns: index_name, start_time, stop time
        log = pd.DataFrame(
            sst_narray[1:, 1:3],
            index=sst_narray[1:, 0],
            columns=["Start_time", "Stop_time"],
            # dtype='datetime64[ns]'
        )
        log.index.name = index_name

        return log

    @property
    def fname(self):
        """The absolute filepath of the input log file."""
        return self.__fname

    @property
    def log(self):
        """The dataframe containing the data found in the log file."""
        return self.__log


[docs]
    def summary(self, colname):
        """Returns a dataframe of summary statistics."""
        return self.__log[colname].describe()





[docs]
class Mask:
    """
    Mixin that adds masking + preprocessing for actigraphy/light time series.

    This class is designed to be *mixed into* a higher-level recording object that
    already provides:

    Required attributes on the host class
    -------------------------------------
    - ``activity`` : pandas.Series or None
        Activity counts indexed by time (DatetimeIndex).
    - ``light`` : pandas.Series or None
        Light intensity (e.g., lux) indexed by time (DatetimeIndex).
    - ``frequency`` : pandas.Timedelta
        Sampling epoch (e.g., 30 seconds, 1 minute).
    - ``start_time`` : pandas.Timestamp
        Beginning of the analysis window.
    - ``period`` : pandas.Timedelta
        Duration of the analysis window.

    What a “mask” means
    -------------------
    A mask is a time series aligned to the data index (same timestamps), typically:
    - 1 = keep the data point
    - 0 = exclude the data point (e.g., non-wear, sensor failure)

    Masking is commonly used to remove long stretches of zero activity that likely
    represent *device removal*, not true sleep/inactivity.

    Key features
    ------------
    - Auto-create an inactivity mask: flag sustained “zero-activity” stretches.
    - Manually add mask intervals using start/stop times.
    - Apply preprocessing filters:
        - resampling to a new epoch
        - binarization (thresholding)
        - optional missing-data imputation
        - mask-based exclusion

    Parameters
    ----------
    exclude_if_mask : bool
        If True, masked samples are excluded downstream (e.g., set to NaN or removed,
        depending on the processing function).
    mask_inactivity : bool
        If True, apply the inactivity mask when filtering.
    binarize : bool
        If True, binarize the signal using ``threshold``.
    threshold : float
        Threshold used when ``binarize=True``.
    inactivity_length : int or None
        Minimum number of consecutive zero-activity epochs to be considered “inactivity”
        (potential non-wear). If None, no inactivity mask is created automatically.
    mask : pandas.Series or None
        A precomputed mask series aligned to the data index (1=keep, 0=exclude).

    Attributes
    ----------
    mask : pandas.Series
        Mask aligned to the current analysis window. If no mask exists but
        ``inactivity_length`` is set, it is created automatically on first access.
    inactivity_length : int or None
        Minimum consecutive zero epochs defining an inactivity segment.
    """
    def __init__(self, exclude_if_mask, mask_inactivity, binarize, threshold, inactivity_length, mask):
        self.exclude_if_mask = exclude_if_mask
        self._inactivity_length = inactivity_length
        self._mask_inactivity = mask_inactivity
        self._original_activity = self.activity
        self._original_light = self.light if self.light is not None else None
        self.binarize = binarize
        self.threshold = threshold if binarize else None
        self.impute_nan = False
        self.imputation_method = 'mean'
        self._mask = mask

    def _filter_data(self,
                     data,
                     new_freq,
                     binarize,
                     impute_nan,
                     threshold,
                     exclude_if_mask,
                     imputation_method):
        self.threshold = threshold
        self.binarize = binarize
        self.impute_nan = impute_nan
        self.imputation_method = imputation_method
        self.exclude_if_mask = exclude_if_mask

        return _data_processor(
            data=data,
            binarize=self.binarize,
            threshold=self.threshold,
            current_freq=self.frequency,
            new_freq=self.frequency if new_freq is None else new_freq,
            mask=self._mask,
            mask_inactivity=self._mask_inactivity,
            impute_nan=self.impute_nan,
            imputation_method=self.imputation_method,
            exclude_if_mask=self.exclude_if_mask,
        )


[docs]
    def apply_filters(self,
                      new_freq=None,
                      binarize=False,
                      threshold=0,
                      apply_mask=False,
                      impute_nan=False,
                      exclude_if_mask=False,
                      imputation_method='mean'):
        """
        Apply preprocessing to activity and/or light time series.

        This method optionally:
        - Resamples the data to a new epoch length (e.g., 30s to 1 min).
        - Binarizes using a threshold
        - Applies masking (exclude masked samples)
        - Impute missing values

        Parameters
        ----------
        new_freq : str or pandas offset, optional
            New epoch length (e.g., "1min", "30s"). If None, keep current frequency.
        binarize : bool
            If True, binarize values using ``threshold``.
        threshold : float
            Threshold for binarization.
        apply_mask : bool
            If True, apply the current mask (and create inactivity mask if configured).
        impute_nan : bool
            If True, fill NaNs using ``imputation_method``.
        exclude_if_mask : bool
            If True, masked values are excluded (implementation depends on the processing backend).
        imputation_method : {"mean", "median", ...}
            Strategy for filling missing values.
        """
        # Reset filters before applying new ones
        self.reset_filters()
        if apply_mask:
            if self._mask is None:
                print('No mask was found. Create a new mask.')
            else:
                self._mask_inactivity = True

        if self.activity is not None:
            self.activity = self._filter_data(self.activity,
                                              new_freq=new_freq,
                                              binarize=binarize,
                                              threshold=threshold,
                                              impute_nan=impute_nan,
                                              exclude_if_mask=False,
                                              imputation_method=imputation_method)
        if self.light is not None:
            self.light = self._filter_data(self.light,
                                           new_freq=new_freq,
                                           binarize=binarize,
                                           threshold=threshold,
                                           impute_nan=impute_nan,
                                           exclude_if_mask=False,
                                           imputation_method=imputation_method)




[docs]
    def reset_filters(self):
        self._mask_inactivity = False
        if self.activity is not None:
            self.activity = self._original_activity
        if self.light is not None:
            self.light = self._original_light


    @property
    def mask(self):
        r"""Mask used to filter out inactive data."""
        if self._mask is None:
            # Create a mask if it does not exist
            if self._inactivity_length is not None:
                # Create an inactivity mask with the specified length (and above)
                self.create_inactivity_mask(self._inactivity_length)
                return self._mask.loc[self.start_time : self.start_time + self.period]
            else:
                print("Inactivity length set to None. Could not create a mask.")
        else:
            return self._mask.loc[self.start_time : self.start_time + self.period]

    @mask.setter
    def mask(self, value):
        self._mask = value

    @property
    def inactivity_length(self):
        r"""Length of the inactivity mask."""
        return self._inactivity_length

    @inactivity_length.setter
    def inactivity_length(self, value):
        self._inactivity_length = value
        # Discard current mask (will be recreated upon access if needed)
        self._mask = None
        # Set switch to False if None
        if value is None:
            self._mask_inactivity = False



[docs]
    def create_inactivity_mask(self, duration):
        """Create a mask for inactivity (count equal to zero) periods.

        This mask has the same length as its underlying data and can be used
        to obfuscate inactive periods where the actimeter has most likely been
        removed.
        Warning: use a sufficiently long duration in order not to mask sleep
        periods.
        A minimal duration corresponding to two hours seems reasonable.

        Parameters
        ----------
        duration: int or str
            Minimal number of consecutive zeroes for an inactive period.
            Time offset strings (ex: '90min') can also be used.
        """

        if isinstance(duration, int):
            nepochs = duration
        elif isinstance(duration, str):
            nepochs = int(pd.Timedelta(duration) / self.frequency)
        else:
            nepochs = None
            warnings.warn(
                "Inactivity length must be a int and time offset string (ex: "
                "'90min'). Could not create a mask.",
                UserWarning,
            )

        # Store requested mask duration (and discard current mask)
        self.inactivity_length = nepochs

        # Create actual mask
        self.mask = _create_inactivity_mask(self.activity, nepochs, 1)




[docs]
    def add_mask_period(self, start, stop):
        """Add a period to the inactivity mask

        Parameters
        ----------
        start: str
            Start time (YYYY-MM-DD HH:MM:SS) of the inactivity period.
        stop: str
            Stop time (YYYY-MM-DD HH:MM:SS) of the inactivity period.
        """

        # Check if a mask has already been created
        # NB : if the inactivity_length is not None, accessing the mask will
        # trigger its creation.
        if self.inactivity_length is None:
            self.inactivity_length = -1
            # self.mask = pd.Series(
            #     np.ones(self.length()),
            #     index=self.data.index
            # )

        # Check if start and stop are within the index range
        if pd.Timestamp(start) < self.mask.index[0]:
            raise ValueError(
                (
                    "Attempting to set the start time of a mask period before "
                    + "the actual start time of the data.\n"
                    + "Mask start time: {}".format(start)
                    + "Data start time: {}".format(self.mask.index[0])
                )
            )
        if pd.Timestamp(stop) > self.mask.index[-1]:
            raise ValueError(
                (
                    "Attempting to set the stop time of a mask period after "
                    + "the actual stop time of the data.\n"
                    + "Mask stop time: {}".format(stop)
                    + "Data stop time: {}".format(self.mask.index[-1])
                )
            )

        # Set mask values between start and stop to zeros
        # self.mask.loc[start:stop] = 0
        self.mask = self.mask.mask(
            (self.mask.index >= start) & (self.mask.index <= stop), 0
        )



[docs]
    def add_mask_periods(self, input_fname, *args, **kwargs):
        """Add periods to the inactivity mask

        Function to read start and stop times from a Mask log file. Supports
        different file format (.ods, .xls(x), .csv).

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        *args
            Variable length argument list passed to the subsequent reader
            function.
        **kwargs
            Arbitrary keyword arguments passed to the subsequent reader
            function.
        """

        # Convert the log file into a DataFrame
        absname, log = BaseLog.from_file(input_fname, "Mask", *args, **kwargs)

        # Iterate over the rows of the DataFrame
        for _, row in log.iterrows():
            self.add_mask_period(row["Start_time"], row["Stop_time"])