import pandas as pd
import numpy as np
import os
import warnings
import pyexcel as pxl
from pandas.tseries.frequencies import to_offset
from ..analysis.tools import *
class BaseLog:
    """
    Read and store time intervals (start/stop times) from a "log file".

    In circadian/actigraphy analysis workflows, one often keeps a separate
    log describing periods to exclude or flag, e.g.:

    - Naps or bedtimes recorded manually
    - Mask periods suspected of being spurious inactivity (e.g., non-wear)

    This class reads a table with three columns:

    - An identifier (e.g., "Mask", "Nap", "Event") used as the row index.
    - Start time (datetime)
    - Stop time (datetime)

    After reading, the log is stored as a pandas DataFrame with an additional
    column: ``Duration = Stop_time - Start_time``.

    Supported file formats
    ----------------------
    - CSV: .csv
    - Spreadsheet-like: .xlsx, .xls, .ods (via pyexcel)

    Parameters
    ----------
    input_fname : str
        Path to the input log file (relative or absolute).
    log : pandas.DataFrame
        A DataFrame containing at least:

        - Start_time (datetime64)
        - Stop_time (datetime64)

    Attributes
    ----------
    fname : str
        Absolute path to the log file used.
    log : pandas.DataFrame
        Cleaned log table with columns ``Start_time``, ``Stop_time``, and
        ``Duration``.
    """

    def __init__(self, input_fname, log):
        # Resolve to an absolute path so `fname` is unambiguous regardless
        # of the current working directory at read time.
        self.__fname = os.path.abspath(input_fname)
        # Derive the duration of each interval from its start/stop times.
        # NB: this mutates the caller's DataFrame in place (as does dropna).
        log["Duration"] = log["Stop_time"] - log["Start_time"]
        # Drop rows with missing values (incomplete start/stop pairs).
        log.dropna(inplace=True)
        self.__log = log

    @classmethod
    def from_file(cls, input_fname, index_name, *args, **kwargs):
        """Read start/stop-times from log files.

        Generic function to read start and stop times from log files.
        Supports different file formats (.ods, .xls(x), .csv), dispatching
        on the file extension.

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        index_name: str
            Name of the index.
        *args
            Variable length argument list passed to the subsequent reader
            function.
        **kwargs
            Arbitrary keyword arguments passed to the subsequent reader
            function.

        Returns
        -------
        absname: str
            Absolute filepath of the input log file.
        log: pandas.DataFrame
            Dataframe containing the data found in the log file.

        Raises
        ------
        ValueError
            If the file extension is not one of .csv, .ods, .xls or .xlsx.
        """
        # Get absolute file path, then dispatch on the extension.
        absname = os.path.abspath(input_fname)
        basename = os.path.basename(absname)
        _, ext = os.path.splitext(basename)
        if ext == ".csv":
            log = cls.__from_csv(absname, index_name, *args, **kwargs)
        elif ext in (".xlsx", ".xls", ".ods"):
            log = cls.__from_excel(absname, index_name, *args, **kwargs)
        else:
            # Bug fix: the original concatenation was missing separating
            # spaces ("...{}is not currently supported.Supported...").
            raise ValueError(
                "File format for the input file {} ".format(basename)
                + "is not currently supported. "
                + "Supported file formats:\n"
                + ".csv (text),\n"
                + ".ods (OpenOffice spreadsheet),\n"
                + ".xls(x) (Excel spreadsheet)."
            )
        return absname, log

    @classmethod
    def __from_csv(cls, input_fname, index_name, sep=",", dayfirst=False):
        """Read start/stop-times from .csv files.

        Specific function to read start and stop times from csv files.

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        index_name: str
            Name of the index.
        sep: str, optional
            Delimiter to use.
            Default is ','.
        dayfirst: bool, optional
            If set to True, use DD/MM/YYYY format dates.
            Default is False.

        Returns
        -------
        log : pandas.DataFrame
            A dataframe with the start and stop times (columns).
        """
        # Only the first three columns are read: index, start, stop.
        log = pd.read_csv(
            input_fname,
            sep=sep,
            dayfirst=dayfirst,
            header=0,
            index_col=[0],
            usecols=[0, 1, 2],
            names=[index_name, "Start_time", "Stop_time"],
            parse_dates=[1, 2],
        )
        return log

    @classmethod
    def __from_excel(cls, input_fname, index_name):
        """Read start/stop-times from excel-like files.

        Specific function to read start and stop times from .ods/.xls(x)
        files.

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        index_name: str
            Name of the index.

        Returns
        -------
        log : pandas.DataFrame
            A dataframe with the start and stop times (columns).
        """
        # Read the whole sheet into a numpy array; row 0 is the header.
        sst_narray = np.array(pxl.get_array(file_name=input_fname))
        # Column 0 is the index; columns 1-2 are the start/stop times.
        # NOTE(review): values are kept as read (no datetime coercion here);
        # pyexcel usually yields datetime objects for date cells — confirm.
        log = pd.DataFrame(
            sst_narray[1:, 1:3],
            index=sst_narray[1:, 0],
            columns=["Start_time", "Stop_time"],
        )
        log.index.name = index_name
        return log

    @property
    def fname(self):
        """The absolute filepath of the input log file."""
        return self.__fname

    @property
    def log(self):
        """The dataframe containing the data found in the log file."""
        return self.__log

    def summary(self, colname):
        """Return summary statistics (pandas ``describe``) for a column."""
        return self.__log[colname].describe()
class Mask:
    """
    Mixin that adds masking + preprocessing for actigraphy/light time series.

    This class is designed to be *mixed into* a higher-level recording
    object that already provides:

    Required attributes on the host class
    -------------------------------------
    - ``activity`` : pandas.Series or None
        Activity counts indexed by time (DatetimeIndex).
    - ``light`` : pandas.Series or None
        Light intensity (e.g., lux) indexed by time (DatetimeIndex).
    - ``frequency`` : pandas.Timedelta
        Sampling epoch (e.g., 30 seconds, 1 minute).
    - ``start_time`` : pandas.Timestamp
        Beginning of the analysis window.
    - ``period`` : pandas.Timedelta
        Duration of the analysis window.

    What a "mask" means
    -------------------
    A mask is a time series aligned to the data index (same timestamps):

    - 1 = keep the data point
    - 0 = exclude the data point (e.g., non-wear, sensor failure)

    Masking is commonly used to remove long stretches of zero activity that
    likely represent *device removal*, not true sleep/inactivity.

    Key features
    ------------
    - Auto-create an inactivity mask: flag sustained zero-activity stretches.
    - Manually add mask intervals using start/stop times.
    - Apply preprocessing filters: resampling, binarization, optional
      missing-data imputation, mask-based exclusion.

    Parameters
    ----------
    exclude_if_mask : bool
        If True, masked samples are excluded downstream (e.g., set to NaN
        or removed, depending on the processing function).
    mask_inactivity : bool
        If True, apply the inactivity mask when filtering.
    binarize : bool
        If True, binarize the signal using ``threshold``.
    threshold : float
        Threshold used when ``binarize=True``.
    inactivity_length : int or None
        Minimum number of consecutive zero-activity epochs to be considered
        "inactivity" (potential non-wear). If None, no inactivity mask is
        created automatically.
    mask : pandas.Series or None
        A precomputed mask series aligned to the data index (1=keep,
        0=exclude).

    Attributes
    ----------
    mask : pandas.Series
        Mask aligned to the current analysis window. If no mask exists but
        ``inactivity_length`` is set, it is created automatically on first
        access.
    inactivity_length : int or None
        Minimum consecutive zero epochs defining an inactivity segment.
    """

    def __init__(self, exclude_if_mask, mask_inactivity, binarize,
                 threshold, inactivity_length, mask):
        self.exclude_if_mask = exclude_if_mask
        self._inactivity_length = inactivity_length
        self._mask_inactivity = mask_inactivity
        # Keep pristine copies of the data so `reset_filters` can restore
        # them later.
        # NOTE(review): assumes the host class sets `activity`/`light`
        # *before* this mixin's __init__ runs — confirm the call order.
        self._original_activity = self.activity
        self._original_light = self.light if self.light is not None else None
        self.binarize = binarize
        # A threshold is only meaningful when binarizing.
        self.threshold = threshold if binarize else None
        self.impute_nan = False
        self.imputation_method = 'mean'
        self._mask = mask

    def _filter_data(self,
                     data,
                     new_freq,
                     binarize,
                     impute_nan,
                     threshold,
                     exclude_if_mask,
                     imputation_method):
        """Record the current filter settings and delegate the actual
        processing to the ``_data_processor`` helper (from analysis.tools)."""
        self.threshold = threshold
        self.binarize = binarize
        self.impute_nan = impute_nan
        self.imputation_method = imputation_method
        self.exclude_if_mask = exclude_if_mask
        return _data_processor(
            data=data,
            binarize=self.binarize,
            threshold=self.threshold,
            current_freq=self.frequency,
            # Keep the current epoch length if no resampling was requested.
            new_freq=self.frequency if new_freq is None else new_freq,
            mask=self._mask,
            mask_inactivity=self._mask_inactivity,
            impute_nan=self.impute_nan,
            imputation_method=self.imputation_method,
            exclude_if_mask=self.exclude_if_mask,
        )

    def apply_filters(self,
                      new_freq=None,
                      binarize=False,
                      threshold=0,
                      apply_mask=False,
                      impute_nan=False,
                      exclude_if_mask=False,
                      imputation_method='mean'):
        """
        Apply preprocessing to activity and/or light time series.

        This method optionally:

        - Resamples the data to a new epoch length (e.g., 30s to 1 min).
        - Binarizes using a threshold.
        - Applies masking (exclude masked samples).
        - Imputes missing values.

        Parameters
        ----------
        new_freq : str or pandas offset, optional
            New epoch length (e.g., "1min", "30s"). If None, keep current
            frequency.
        binarize : bool
            If True, binarize values using ``threshold``.
        threshold : float
            Threshold for binarization.
        apply_mask : bool
            If True, apply the current mask (and create inactivity mask if
            configured).
        impute_nan : bool
            If True, fill NaNs using ``imputation_method``.
        exclude_if_mask : bool
            If True, masked values are excluded (implementation depends on
            the processing backend).
        imputation_method : {"mean", "median", ...}
            Strategy for filling missing values.
        """
        # Start from the pristine data so filters do not accumulate across
        # repeated calls.
        self.reset_filters()
        if apply_mask:
            if self._mask is None:
                print('No mask was found. Create a new mask.')
            else:
                self._mask_inactivity = True
        # Bug fix: `exclude_if_mask` was previously hard-coded to False in
        # both calls below, silently discarding the caller's argument.
        if self.activity is not None:
            self.activity = self._filter_data(
                self.activity,
                new_freq=new_freq,
                binarize=binarize,
                threshold=threshold,
                impute_nan=impute_nan,
                exclude_if_mask=exclude_if_mask,
                imputation_method=imputation_method)
        if self.light is not None:
            self.light = self._filter_data(
                self.light,
                new_freq=new_freq,
                binarize=binarize,
                threshold=threshold,
                impute_nan=impute_nan,
                exclude_if_mask=exclude_if_mask,
                imputation_method=imputation_method)

    def reset_filters(self):
        """Restore the original (unfiltered) activity/light data and switch
        inactivity masking off."""
        self._mask_inactivity = False
        if self.activity is not None:
            self.activity = self._original_activity
        if self.light is not None:
            self.light = self._original_light

    @property
    def mask(self):
        r"""Mask used to filter out inactive data."""
        if self._mask is None:
            # Lazily create a mask if one is configured but absent.
            if self._inactivity_length is not None:
                self.create_inactivity_mask(self._inactivity_length)
                return self._mask.loc[
                    self.start_time:self.start_time + self.period
                ]
            else:
                # Best-effort behavior preserved: report and return None.
                print("Inactivity length set to None. Could not create a mask.")
        else:
            return self._mask.loc[
                self.start_time:self.start_time + self.period
            ]

    @mask.setter
    def mask(self, value):
        self._mask = value

    @property
    def inactivity_length(self):
        r"""Length of the inactivity mask."""
        return self._inactivity_length

    @inactivity_length.setter
    def inactivity_length(self, value):
        self._inactivity_length = value
        # Discard current mask (will be recreated upon access if needed).
        self._mask = None
        # Switch masking off when the length is unset.
        if value is None:
            self._mask_inactivity = False

    def create_inactivity_mask(self, duration):
        """Create a mask for inactivity (count equal to zero) periods.

        This mask has the same length as its underlying data and can be used
        to obfuscate inactive periods where the actimeter has most likely
        been removed.

        Warning: use a sufficiently long duration in order not to mask sleep
        periods.
        A minimal duration corresponding to two hours seems reasonable.

        Parameters
        ----------
        duration: int or str
            Minimal number of consecutive zeroes for an inactive period.
            Time offset strings (ex: '90min') can also be used.
        """
        if isinstance(duration, int):
            nepochs = duration
        elif isinstance(duration, str):
            # Convert the time offset string into a number of epochs.
            nepochs = int(pd.Timedelta(duration) / self.frequency)
        else:
            nepochs = None
            warnings.warn(
                "Inactivity length must be a int and time offset string (ex: "
                "'90min'). Could not create a mask.",
                UserWarning,
            )
        # Store requested mask duration (this also discards any current
        # mask via the `inactivity_length` setter).
        self.inactivity_length = nepochs
        # Create actual mask (helper from analysis.tools).
        self.mask = _create_inactivity_mask(self.activity, nepochs, 1)

    def add_mask_period(self, start, stop):
        """Add a period to the inactivity mask.

        Parameters
        ----------
        start: str
            Start time (YYYY-MM-DD HH:MM:SS) of the inactivity period.
        stop: str
            Stop time (YYYY-MM-DD HH:MM:SS) of the inactivity period.

        Raises
        ------
        ValueError
            If `start` (resp. `stop`) falls before (resp. after) the range
            covered by the mask index.
        """
        # If no inactivity length is configured, set it to a sentinel so
        # that accessing `self.mask` below triggers mask creation.
        # NB: accessing the mask triggers its creation when
        # `inactivity_length` is not None.
        if self.inactivity_length is None:
            self.inactivity_length = -1
        # Check that start and stop are within the index range.
        if pd.Timestamp(start) < self.mask.index[0]:
            # Bug fix: a newline was missing between the two report lines.
            raise ValueError(
                "Attempting to set the start time of a mask period before "
                + "the actual start time of the data.\n"
                + "Mask start time: {}\n".format(start)
                + "Data start time: {}".format(self.mask.index[0])
            )
        if pd.Timestamp(stop) > self.mask.index[-1]:
            raise ValueError(
                "Attempting to set the stop time of a mask period after "
                + "the actual stop time of the data.\n"
                + "Mask stop time: {}\n".format(stop)
                + "Data stop time: {}".format(self.mask.index[-1])
            )
        # Set mask values between start and stop (inclusive) to zero.
        self.mask = self.mask.mask(
            (self.mask.index >= start) & (self.mask.index <= stop), 0
        )

    def add_mask_periods(self, input_fname, *args, **kwargs):
        """Add periods to the inactivity mask.

        Function to read start and stop times from a Mask log file. Supports
        different file formats (.ods, .xls(x), .csv).

        Parameters
        ----------
        input_fname: str
            Path to the log file.
        *args
            Variable length argument list passed to the subsequent reader
            function.
        **kwargs
            Arbitrary keyword arguments passed to the subsequent reader
            function.
        """
        # Convert the log file into a DataFrame and mask each interval.
        absname, log = BaseLog.from_file(input_fname, "Mask", *args, **kwargs)
        for _, row in log.iterrows():
            self.add_mask_period(row["Start_time"], row["Stop_time"])