import pandas as pd
[docs]def concurrent_cagetreatment(df, cagestays,
protect_duplicates=[
'Animal_id',
'Cage_id',
'Cage_Treatment_start_date',
'Cage_Treatment_end_date',
'Cage_TreatmentProtocol_code',
'Treatment_end_date',
'Treatment_end_date',
'TreatmentProtocol_code',
],
):
"""
Return a `pandas.DataFrame` object containing only `Cage_Treatment*` entries which are concurrent with the animal stay in the cage to which they were administered.
Parameters
----------
df : pandas.DataFrame
Pandas Dataframe, with columns containing:
`Animal_id`,
`Animal_death_date`,
`CageStay_start_date`,
`Cage_Treatment_start_date`,
`Cage_TreatmentProtocol_code`.
cagestays : pandas.DataFrame
Pandas Dataframe, with columns containing:
`Animal_id`,
`CageStay_end_date`,
`CageStay_start_date`,
Notes
-----
This function checks whether cage-level treatment onsets indeed happened during the period in which the animal was housed in the cage.
We do not check for the treatment end dates, as an animal which has received a partial treatment has received a treatment.
Checks for treatment discontinuation due to e.g. death should be performed elsewhere.
"""
drop_idx = []
for subject in list(df['Animal_id'].unique()):
stay_starts = df[df['Animal_id']==subject]['CageStay_start_date'].tolist()
# The per-animal treatment info is recorded in each table row, but if the animal only has one cage stay without a cage treatment, it will be deleted, taking the animal treatment information with it.
# We avoid this here:
blank_cells_only = False
if len(stay_starts) == 1:
if df.loc[df['Animal_id']==subject, 'TreatmentProtocol_code'].item() != None:
blank_cells_only = True
for stay_start in stay_starts:
stay_end = cagestays[(cagestays['Animal_id']==subject)&(cagestays['CageStay_start_date']==stay_start)]['CageStay_end_date'].tolist()[0]
treatment_start = df[(df['Animal_id']==subject)&(df['CageStay_start_date']==stay_start)]['Cage_Treatment_start_date'].tolist()[0]
death_date = df[df['Animal_id']==subject]['Animal_death_date'].tolist()[0]
# We do not check for treatment end dates, because often you may want to include recipients of incomplete treatments (e.g. due to death) when filtering based on cagestays.
# Filtering based on death should be done elsewhere.
if treatment_start <= stay_start or treatment_start >= stay_end or treatment_start >= death_date:
if blank_cells_only:
df.loc[df['Animal_id']==subject, ['Cage_TreatmentProtocol_code', 'Cage_Treatment_start_date', 'Cage_Treatment_end_date', 'Cage_Treatment_protocol_id']] = None
else:
drop_idx.extend(df[(df['Animal_id']==subject)&(df['CageStay_start_date']==stay_start)].index.tolist())
df = df.drop(drop_idx)
#df = df.drop_duplicates(subset=protect_duplicates)
return df
[docs]def collapse_rename(df, groupby, collapse,
rename=False,
):
"""
Collapse long form columns according to a lambda function, so that groupby column values are rendered unique
Parameters
----------
df : pandas.DataFrame
A `pandas.DataFrame` object which you want to collapse.
groupby : string
The name of a column from `df`, the values of which you want to render unique.
collapse : dict
A dictionary the keys of which are columns you want to collapse, and the values of which are lambda functions instructing how to collapse (e.g. concatenate) the values.
rename : dict, optional
A dictionary the keys of which are names of columns from `df`, and the values of which are new names for these columns.
"""
df = df.groupby(groupby).agg(collapse)
if rename:
df = df.rename(columns=rename)
return df
[docs]def relativize_dates(df,
date_suffix='_date',
rounding='D',
rounding_type='round',
reference_date=True,
):
"""
Express dates on each row of a Pandas Dataframe as datetime objects relative to the row value on the 'reference_date' column.
Parameters
----------
df : pandas.DataFrame
Pandas Dataframe, with columns containing 'reference_date' and strings ending in `date_suffix`.
date_suffix : str, optional
String sufix via which to identify date columns needing manipulation.
rounding : str, optional
Datetime increment for date rounding.
rounding_type : {'round','floor','ceil'}, optional
Whether to round the dates (splits e.g. days apart at noon, hours at 30 minutes, etc.) or to take the floor or the ceiling.
"""
if isinstance(reference_date, bool) and reference_date:
df['reference_date'] = df['Cage_Treatment_start_date']
elif isinstance(reference_date, str):
df['reference_date'] = df[reference_date]
date_columns = [i for i in df.columns.tolist() if i.endswith(date_suffix)]
for date_column in date_columns:
try:
df[date_column] = df[date_column]-df['reference_date']
except TypeError:
pass
else:
if rounding:
start = pd.to_datetime('1970-01-01')
df[date_column] = df[date_column] + start
if rounding_type == 'round':
df[date_column] = df[date_column].dt.round(rounding)
elif rounding_type == 'floor':
df[date_column] = df[date_column].dt.floor(rounding)
elif rounding_type == 'ceil':
df[date_column] = df[date_column].dt.ceil(rounding)
return df