Source code for pythologist

import pandas as pd
import numpy as np
import sys, json, h5py
from pythologist.selection import SubsetLogic
from pythologist.measurements.counts import PercentageLogic
from pythologist.measurements.counts import Counts
from pythologist.measurements.spatial.contacts import Contacts
from pythologist.measurements.spatial.nearestneighbors import NearestNeighbors
from pythologist.measurements.spatial.cartesian import Cartesian
from pythologist.interface import SegmentationImages, phenotypes_to_regions as interface_phenotypes_to_regions
from pythologist.qc import QC

class CellDataSeries(pd.Series):
    # pandas.Series subclass paired with CellDataFrame; the two properties
    # below tell pandas which classes to build for derived objects.
    @property
    def _constructor(self):
        # Results that keep the Series shape are built as CellDataSeries.
        return CellDataSeries
    @property
    def _constructor_expanddim(self):
        # Results that gain a dimension are built as CellDataFrame.
        return CellDataFrame
    

class CellDataFrame(pd.DataFrame):
    """
    The **CellDataFrame** class is an extension of a pandas.DataFrame with
    per-cell rows that have region, binary calls, mutually exclusive
    phenotypes, cell locations, and cell-cell contact.

    Attributes:
        microns_per_pixel (float): conversion factor that gets saved along
            with the dataframe once it is set. (20x vectra is a 0.496)
        db (CellProject): a storage class that has all the image and mask data
    """

    # Extra attribute names registered with pandas so the dataframe can be
    # extended with these properties.
    _metadata = ['_microns_per_pixel', '_db']

    def __init__(self, *args, **kw):
        # All arguments are handed to pandas.DataFrame unchanged.
        super().__init__(*args, **kw)

    @property
    def _constructor(self):
        # Frame-shaped results stay CellDataFrame.
        return CellDataFrame

    @property
    def _constructor_sliced(self):
        # Results that lose a dimension become CellDataSeries.
        return CellDataSeries

    @property
    def _constructor_expanddim(self):
        return CellDataFrame
def get_valid_cell_indecies(self):
    """
    List the cell indecies present in each image frame.

    Returns:
        pandas.DataFrame: one row per frame (identified by the
        ``frame_columns`` fields) with a 'valid' column holding the list of
        ``cell_index`` values that can be included for that frame.
    """
    per_frame = pd.DataFrame(self).groupby(self.frame_columns)
    cell_lists = per_frame.apply(lambda frame: list(frame['cell_index']))
    # The unnamed result column is labelled 0 after reset_index
    return cell_lists.reset_index().rename(columns={0: 'valid'})
def prune_neighbors(self):
    """
    Drop cell-cell contacts that point at cells no longer in the dataset.

    If the CellDataFrame has been subsetted, some of the recorded contacts
    may refer to cells that were removed. Each cell's ``neighbors``
    dictionary is reduced to the keys that are still valid cell indecies
    within the same frame.

    Returns:
        CellDataFrame: A CellDataFrame with only valid cell-cell contacts
    """
    def _keep_valid(contacts, allowed):
        # NaN is the only value not equal to itself: a missing neighbors
        # entry stays missing.
        if not (contacts == contacts):
            return np.nan
        allowed = set(allowed)
        return {k: v for k, v in contacts.items() if k in allowed}

    key_columns = self.frame_columns + ['cell_index']

    # Attach the per-frame list of valid cell indecies to every cell
    valid_cells = self.get_valid_cell_indecies()
    lookup = pd.DataFrame(self).merge(valid_cells, on=self.frame_columns)
    lookup = lookup.set_index(key_columns)

    pruned = lookup.apply(
        lambda row: _keep_valid(row['neighbors'], row['valid']), 1
    )
    pruned = pruned.reset_index().rename(columns={0: 'new_neighbors'})

    # Swap the old neighbors column for the pruned one
    fixed = self.copy().merge(pruned, on=key_columns)
    fixed = fixed.drop(columns='neighbors')
    fixed = fixed.rename(columns={'new_neighbors': 'neighbors'})
    fixed.microns_per_pixel = self.microns_per_pixel
    fixed.db = self.db
    return fixed
@property
def frame_columns(self):
    """
    Returns a list of fields suitable for identifying the unique image frames
    """
    project = ['project_id', 'project_name']
    sample = ['sample_id', 'sample_name']
    frame = ['frame_id', 'frame_name']
    return project + sample + frame

@property
def sample_columns(self):
    """
    Returns a list of fields suitable for identifying the unique samples
    """
    project = ['project_id', 'project_name']
    sample = ['sample_id', 'sample_name']
    return project + sample

@property
def project_columns(self):
    """
    Returns a list of fields suitable for identifying the unique projects
    """
    return ['project_id', 'project_name']
def to_hdf(self, path, key, mode='a'):
    """
    Save the CellDataFrame to an hdf5 file.

    The serialized table is written with pandas, then the file is reopened
    with h5py to store ``microns_per_pixel`` as an attribute on the same
    key (NaN is stored when it is not set).

    Args:
        path (str): the path to save to
        key (str): the name of the location to save it to
        mode (str): write mode
    """
    pd.DataFrame(self.serialize()).to_hdf(
        path, key=key, mode=mode,
        format='table', complib='zlib', complevel=9,
    )
    mpp = self.microns_per_pixel
    # The context manager closes the file even if the attribute write fails
    with h5py.File(path, 'r+') as f:
        f[key].attrs["microns_per_pixel"] = float(mpp) if mpp is not None else np.nan
def phenotypes_to_scored(self, phenotypes=None, overwrite=False):
    """
    Add mutually exclusive phenotypes to the scored calls

    Args:
        phenotypes (list): a list of phenotypes to add to scored calls.
            If None or not set, add them all. A single str is accepted.
        overwrite (bool): if True allow the overwrite of a phenotype,
            if False, the phenotype must not exist in the scored calls

    Returns:
        CellDataFrame

    Raises:
        ValueError: if the frame is not uniform, or a phenotype name already
            exists among the scored calls and overwrite is False
    """
    if not self.is_uniform():
        raise ValueError("inconsistent phenotypes")
    if phenotypes is None:
        phenotypes = self.phenotypes
    elif isinstance(phenotypes, str):
        phenotypes = [phenotypes]

    def _add_flags(scored, label):
        clash = set(phenotypes) & set(scored.keys())
        if clash and overwrite == False:
            raise ValueError("Error, phenotype already exists as a scored type")
        updated = scored.copy()
        updated.update((name, 0) for name in phenotypes)
        # label == label is False only for NaN (cell without a phenotype)
        if label == label and label in phenotypes:
            updated[label] = 1
        return updated

    output = self.copy()
    output['scored_calls'] = output.apply(
        lambda row: _add_flags(row['scored_calls'], row['phenotype_label']), 1
    )
    return output
@classmethod
def concat(cls, array_like):
    """
    Concatenate multiple CellDataFrames.

    Args:
        array_like (list): a list of CellDataFrames with 1 or more CellDataFrames

    Returns:
        CellDataFrame: the stacked rows, built with the class the method was
        called on, carrying the shared microns_per_pixel value

    Raises:
        ValueError: if no frames are given or the microns_per_pixel is not
            uniform across the frames
    """
    frames = list(array_like)
    if not frames:
        raise ValueError("Cannot concatenate an empty list of CellDataFrames")
    if len({x.microns_per_pixel for x in frames}) != 1:
        raise ValueError("Multiple microns per pixel set")
    # Concatenate as plain DataFrames, then wrap once in the target class
    cdf = cls(pd.concat([pd.DataFrame(x) for x in frames]))
    cdf.microns_per_pixel = frames[0].microns_per_pixel
    return cdf
@classmethod
def read_hdf(cls, path, key=None):
    """
    Read a CellDataFrame from an hdf5 file.

    The JSON-encoded columns are decoded back to Python objects and the
    ``microns_per_pixel`` attribute stored on ``key`` is restored when it
    is not NaN.

    Args:
        path (str): the path to read from
        key (str): the name of the location to read from. It is also used
            to look up the stored attribute.
            NOTE(review): the default of None presumably does not work for
            that attribute lookup; confirm before relying on it.

    Returns:
        CellDataFrame
    """
    df = pd.read_hdf(path, key)
    json_columns = (
        'scored_calls', 'channel_values', 'regions',
        'phenotype_calls', 'neighbors',
    )
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    # Neighbor keys come back from JSON as strings; restore them to ints.
    # Anything that is not a dict is treated as missing.
    df['neighbors'] = df['neighbors'].apply(
        lambda x: np.nan if not isinstance(x, dict)
        else {int(k): v for k, v in x.items()}
    )
    df['frame_shape'] = df['frame_shape'].apply(lambda x: tuple(json.loads(x)))
    df = cls(df)
    # The context manager closes the file even if the attribute is missing
    with h5py.File(path, 'r') as f:
        mpp = f[key].attrs["microns_per_pixel"]
    if not np.isnan(mpp):
        df.microns_per_pixel = mpp
    return df
def serialize(self):
    """
    Convert the data to one that can be saved in h5 structures

    The object-valued columns (scored_calls, channel_values, regions,
    phenotype_calls, neighbors, frame_shape) are replaced by their JSON
    string encoding on a copy; the original frame is left untouched.

    Returns:
        pandas.DataFrame: like a cell data frame but serialized.
    """
    object_columns = (
        'scored_calls', 'channel_values', 'regions',
        'phenotype_calls', 'neighbors', 'frame_shape',
    )
    df = self.copy()
    for column in object_columns:
        df[column] = df[column].apply(json.dumps)
    return df
@property
def microns_per_pixel(self):
    """
    Read or store the microns per pixel (float) value by reading or
    assigning to this. Reads as None until a value has been assigned.
    """
    return getattr(self, '_microns_per_pixel', None)

@microns_per_pixel.setter
def microns_per_pixel(self, value):
    self._microns_per_pixel = value
def is_uniform(self, verbose=True):
    """
    Check to make sure phenotype calls, or scored calls are consistent
    across all images / samples

    Every cell must carry the same set of keys in ``phenotype_calls`` and
    the same set of keys in ``scored_calls``.

    Args:
        verbose (bool): write a warning to stderr describing the first
            inconsistency found

    Returns:
        bool: True when both columns have a single key set, else False
    """
    checks = (
        ('phenotype_calls', "WARNING: phenotypes differ across the dataframe \n"),
        ('scored_calls', "WARNING: scored_calls differ across the dataframe \n"),
    )
    for column, warning in checks:
        # Dicts are not hashable, so go through JSON to find the distinct ones
        encoded = self[column].apply(lambda x: json.dumps(x)).unique()
        decoded = pd.Series(encoded).apply(lambda x: json.loads(x))
        key_sets = decoded.apply(lambda x: tuple(sorted(x.keys()))).unique()
        if len(key_sets) > 1:
            if verbose:
                sys.stderr.write(warning + str(key_sets) + "\n")
            return False
    return True
@property
def db(self):
    """
    Assign to this or read from this, the CellProject storage object.
    Reads as None until a value has been assigned.
    """
    return getattr(self, '_db', None)

@db.setter
def db(self, db):
    self._db = db

@property
def phenotypes(self):
    """
    Return the list of phenotypes present
    """
    calls = self['phenotype_calls']
    return _extract_unique_keys_from_series(calls)

@property
def scored_names(self):
    """
    Return the list of binary feature names
    """
    calls = self['scored_calls']
    return _extract_unique_keys_from_series(calls)

@property
def regions(self):
    """
    Return the list of region names
    """
    areas = self['regions']
    return _extract_unique_keys_from_series(areas)
def get_measured_regions(self):
    """
    List every region of every image frame together with its size.

    Returns:
        pandas.DataFrame: Output a dataframe with regions and region sizes.
        One row per (frame, region) with the six frame identifier columns
        plus 'region_label' and 'region_area_pixels'. An empty frame gives
        an empty dataframe with those same columns.
    """
    mergeon = ['project_id', 'project_name',
               'sample_id', 'sample_name',
               'frame_id', 'frame_name']
    out_columns = mergeon + ['region_label', 'region_area_pixels']

    # Dicts are unhashable, so de-duplicate frames on their JSON encoding
    temp = self.loc[:, mergeon + ['regions']].\
        set_index(mergeon)['regions'].apply(json.dumps).\
        reset_index().drop_duplicates()
    temp['regions'] = temp['regions'].apply(json.loads)

    records = []
    for _, row in temp.iterrows():
        frame_info = {column: row[column] for column in mergeon}
        for label, area in row['regions'].items():
            record = dict(frame_info)
            record['region_label'] = label
            record['region_area_pixels'] = area
            records.append(record)

    # Passing the columns explicitly keeps the schema when there are no rows
    return pd.DataFrame(records, columns=out_columns).drop_duplicates()
def segmentation_images(self, *args, **kwargs):
    """
    Use the segmented images to create per-image graphics

    Args:
        verbose (bool): output more details if true

    Returns:
        SegmentationImages: returns a class used to construct the image graphics

    Raises:
        ValueError: if no db has been set on this CellDataFrame
    """
    if not self.db:
        raise ValueError("Need to set db")
    segs = SegmentationImages.read_cellframe(self, *args, **kwargs)
    # Carry the scale over from this frame
    segs.microns_per_pixel = self.microns_per_pixel
    return segs
def nearestneighbors(self, *args, **kwargs):
    """
    Build a nearest neighbor analysis object from this CellDataFrame

    All arguments are forwarded to ``NearestNeighbors.read_cellframe``.

    Args:
        verbose (bool): output more details if true
        measured_regions (pandas.DataFrame): explicitly list the measured
            images and regions (defaults to ``get_measured_regions()``)
        measured_phenotypes (list): explicitly list the phenotypes present
            (defaults to the phenotypes of this frame)

    Returns:
        NearestNeighbors: returns a class that holds nearest neighbor
        information for whatever phenotypes were in the CellDataFrame
        before execution. This class is suitable for nearest neighbor and
        proximity operations.
    """
    nn = NearestNeighbors.read_cellframe(self, *args, **kwargs)
    nn.measured_regions = (
        kwargs['measured_regions'] if 'measured_regions' in kwargs
        else self.get_measured_regions()
    )
    nn.measured_phenotypes = (
        kwargs['measured_phenotypes'] if 'measured_phenotypes' in kwargs
        else self.phenotypes
    )
    nn.microns_per_pixel = self.microns_per_pixel
    return nn
def contacts(self, *args, **kwargs):
    """
    Assess the cell-to-cell contacts recorded in the CellDataFrame

    Only the 'measured_regions' and 'measured_phenotypes' keyword arguments
    are used; the frame is read with ``prune_neighbors=True``.

    Args:
        measured_regions (pandas.DataFrame): explicitly list the measured
            images and regions (defaults to ``get_measured_regions()``)
        measured_phenotypes (list): explicitly list the phenotypes present
            (defaults to the phenotypes of this frame)

    Returns:
        Contacts: returns a class that holds cell-to-cell contact
        information for whatever phenotypes were in the CellDataFrame
        before execution.
    """
    result = Contacts.read_cellframe(self, prune_neighbors=True)
    result.measured_regions = (
        kwargs['measured_regions'] if 'measured_regions' in kwargs
        else self.get_measured_regions()
    )
    result.measured_phenotypes = (
        kwargs['measured_phenotypes'] if 'measured_phenotypes' in kwargs
        else self.phenotypes
    )
    result.microns_per_pixel = self.microns_per_pixel
    return result
def cartesian(self, subsets=None, step_pixels=100, max_distance_pixels=150, *args, **kwargs):
    """
    Return a class that can be used to create honeycomb plots

    Args:
        subsets (list): list of SubsetLogic objects
        step_pixels (int): distance between hexagons
        max_distance_pixels (int): the distance from each point by which to
            calculate the quantity of the phenotype for that area
        measured_regions (pandas.DataFrame): explicitly list the measured
            images and regions (defaults to ``get_measured_regions()``)
        measured_phenotypes (list): explicitly list the phenotypes present
            (defaults to the phenotypes of this frame)

    Returns:
        Cartesian: returns a class that holds the layout of the points to plot.
    """
    layout = Cartesian.read_cellframe(
        self,
        *args,
        subsets=subsets,
        step_pixels=step_pixels,
        max_distance_pixels=max_distance_pixels,
        prune_neighbors=False,
        **kwargs
    )
    layout.measured_regions = (
        kwargs['measured_regions'] if 'measured_regions' in kwargs
        else self.get_measured_regions()
    )
    layout.measured_phenotypes = (
        kwargs['measured_phenotypes'] if 'measured_phenotypes' in kwargs
        else self.phenotypes
    )
    layout.microns_per_pixel = self.microns_per_pixel
    return layout
def counts(self, *args, **kwargs):
    """
    Return a class that can be used to access count densities

    Args:
        measured_regions (pandas.DataFrame): Dataframe of regions that are
            being measured (defaults to all the regions)
        measured_phenotypes (list): List of phenotypes present (defaults to
            all the phenotypes)
        minimum_region_size_pixels (int): Minimum region size to calculate
            counts on in pixels (Default: 1)

    Returns:
        Counts: returns a class that holds the counts.
    """
    result = Counts.read_cellframe(self, prune_neighbors=False)
    result.measured_regions = (
        kwargs['measured_regions'] if 'measured_regions' in kwargs
        else self.get_measured_regions()
    )
    result.measured_phenotypes = (
        kwargs['measured_phenotypes'] if 'measured_phenotypes' in kwargs
        else self.phenotypes
    )
    result.microns_per_pixel = self.microns_per_pixel
    result.minimum_region_size_pixels = kwargs.get('minimum_region_size_pixels', 1)
    return result
def qc(self, *args, **kwargs):
    """
    Return a class that can be used to access QC reports

    Any extra arguments are passed through to the QC constructor.

    Returns:
        QC: returns a class that can be used to interrogate the QC.
    """
    report = QC(self, *args, **kwargs)
    return report
def _shuffle_ids(self): together = [] for frame_id in self['frame_id'].unique(): v1 = self.loc[self['frame_id']==frame_id,['cell_index']].copy().reset_index(drop=True) v2 = v1.copy().sample(frac=1) v1['next_index'] = list(v2.index) v1['frame_id'] = frame_id together.append(v1) return pd.concat(together) ### Modifying functions
def merge_scores(self, df_addition, reference_markers='all',
                 addition_markers='all',
                 on=['project_name', 'sample_name', 'frame_name', 'cell_index']):
    """
    Combine CellDataFrames that differ by score composition

    Cells are matched with a left merge on ``on``. A cell without a match
    in ``df_addition`` keeps only its reference markers.

    Args:
        df_addition (CellDataFrame): The CellDataFrame to merge scores in from
        reference_markers (list): which scored call names to keep in this
            object (default: all). Any str selects all; None selects none.
        addition_markers (list): which scored call names to merge in
            (default: all). Any str selects all; None selects none.
        on (list): the features to merge cells on

    Returns:
        CellDataFrame,CellDataFrame: the first frame holds every cell of
        this object with its combined scored calls; the second holds only
        the cells for which no match was found in ``df_addition``.
    """
    if isinstance(reference_markers, str):
        reference_markers = self.scored_names
    elif reference_markers is None:
        reference_markers = []
    if isinstance(addition_markers, str):
        addition_markers = df_addition.scored_names
    elif addition_markers is None:
        addition_markers = []

    # '_key' marks rows that found a partner in the addition
    incoming = df_addition.copy()
    incoming['_key'] = 1
    incoming = incoming[['scored_calls', '_key'] + on]
    incoming = incoming.rename(columns={'scored_calls': '_addition'})
    merged = self.merge(incoming, on=on, how='left')

    def _pick(calls, names):
        return dict((k, calls[k]) for k in names)

    merged['_sub1'] = merged['scored_calls'].apply(
        lambda x: _pick(x, reference_markers)
    )
    # handle NaN where we fail to match properly treat as empty
    merged['_sub2'] = merged['_addition'].apply(
        lambda x: dict({}) if x != x else _pick(x, addition_markers)
    )
    # combine the two dictionaries; addition values win on a name clash
    merged['scored_calls'] = merged.apply(
        lambda x: {**x['_sub1'], **x['_sub2']}, 1
    )
    merged = merged.drop(columns=['_sub1', '_sub2', '_addition'])

    unmatched = merged['_key'].isna()
    everything = merged.drop(columns='_key').copy()
    failures = merged[unmatched].drop(columns='_key').copy()
    if self.microns_per_pixel:
        everything.microns_per_pixel = self.microns_per_pixel
    if self.microns_per_pixel:
        failures.microns_per_pixel = self.microns_per_pixel
    return everything, failures
def rename_scored_calls(self, change):
    """
    Change the names of scored call names.

    The input dictionary ``change`` maps each current name to its new
    name. The new name should not already exist among the scored calls;
    this is not checked here.

    Args:
        change (dict): a dictionary of current name keys and new name values

    Returns:
        CellDataFrame: The CellDataFrame modified.
    """
    def _renamed(row):
        return _dict_rename(row['scored_calls'], change)

    output = self.copy()
    output['scored_calls'] = output.apply(_renamed, 1)
    return output
def zero_fill_missing_phenotypes(self):
    """
    Fill in missing phenotypes by listing any missing data as negative

    If the frame is already uniform a plain copy is returned. Otherwise
    every cell's ``phenotype_calls`` gains a 0 entry for each phenotype
    name that is present elsewhere in the frame but absent for that cell.

    Returns:
        CellDataFrame: The CellDataFrame modified.
    """
    if self.is_uniform(verbose=False):
        return self.copy()

    expected = self.phenotypes

    def _pad(calls):
        padded = dict(calls.items())
        for name in set(expected) - set(calls.keys()):
            padded[name] = 0
        return padded

    output = self.copy()
    output['phenotype_calls'] = output.apply(
        lambda row: _pad(row['phenotype_calls']), 1
    )
    return output
def zero_fill_missing_scores(self):
    """
    Fill in missing scored types by listing any missing data as negative

    If the frame is already uniform a plain copy is returned. Otherwise
    every cell's ``scored_calls`` gains a 0 entry for each scored name
    that is present elsewhere in the frame but absent for that cell.

    Returns:
        CellDataFrame: The CellDataFrame modified.
    """
    if self.is_uniform(verbose=False):
        return self.copy()

    expected = self.scored_names

    def _pad(calls):
        padded = dict(calls.items())
        for name in set(expected) - set(calls.keys()):
            padded[name] = 0
        return padded

    output = self.copy()
    output['scored_calls'] = output.apply(
        lambda row: _pad(row['scored_calls']), 1
    )
    return output
def drop_scored_calls(self, names):
    """
    Take a name or list of scored call names and drop those from the scored calls

    Args:
        names (list): list of names to drop or a single string name to drop

    Returns:
        CellDataFrame: The CellDataFrame modified.
    """
    if isinstance(names, str):
        names = [names]
    output = self.copy()
    output['scored_calls'] = output['scored_calls'].apply(
        lambda calls: {k: v for k, v in calls.items() if k not in names}
    )
    return output
def subset(self, logic, update=False):
    """
    Create a specific phenotype selection based on a 'SubsetLogic' object.

    Takes the union of all the phenotypes listed in the logic (all
    phenotypes when none are listed), then the intersection of all the
    scored calls it lists. Phenotypes that were not selected are removed
    from the ``phenotype_calls`` of the cells that remain.

    Args:
        logic (SubsetLogic): A subsetlogic object to slice on
        update (bool): (default False) change the name of the phenotype
            according to the label in the subset logic

    Returns:
        CellDataFrame: The CellDataFrame modified.

    Raises:
        ValueError: if the logic names a phenotype or scored call that is
            not present in this frame
    """
    pnames = self.phenotypes
    snames = self.scored_names
    data = self.copy()

    selected = logic.phenotypes
    if len(selected) == 0:
        selected = pnames
    removing = set(self.phenotypes) - set(selected)

    # Union: collect the positive cells of each requested phenotype
    pieces = []
    for name in selected:
        if name not in pnames:
            raise ValueError("phenotype must exist in defined")
        positive = data['phenotype_calls'].apply(lambda calls: calls[name] == 1)
        piece = data.loc[positive].copy()
        if len(removing) > 0 and piece.shape[0] > 0:
            piece['phenotype_calls'] = piece.apply(
                lambda row: dict(
                    [(k, v) for k, v in row['phenotype_calls'].items()
                     if k not in removing]
                ), 1
            )
        pieces.append(piece)
    data = pd.concat(pieces)

    # Intersection: every scored call requirement must hold
    for name, sign in logic.scored_calls.items():
        if name not in snames:
            raise ValueError("Scored name must exist in defined")
        wanted = 0 if sign == '-' else 1
        matches = data['scored_calls'].apply(lambda calls: calls[name] == wanted)
        data = data.loc[matches]

    data.microns_per_pixel = self.microns_per_pixel
    if update:
        data['phenotype_calls'] = data['phenotype_calls'].apply(
            lambda calls: {logic.label: 1}
        )
    data.fill_phenotype_label(inplace=True)
    data.db = self.db
    return data
def threshold(self, phenotype, scored_name, positive_label=None, negative_label=None):
    """
    Split a phenotype on a scored_call.

    If no labels are specified the new phenotypes are named
    'PHENOTYPE SCORED_CALL+' and 'PHENOTYPE SCORED_CALL-'. To choose the
    labels, give both the positive and the negative label.

    Cells whose ``phenotype_calls`` do not contain ``phenotype`` at all
    keep their calls unchanged.

    Args:
        phenotype (str): name of the phenotype to threshold
        scored_name (str): scored call name to apply value from
        positive_label (str): name to apply for positive label
            (default: PHENOTYPE SCORED_CALL+)
        negative_label (str): name to apply for negative label
            (default: PHENOTYPE SCORED_CALL-)

    Returns:
        CellDataFrame: The CellDataFrame modified.

    Raises:
        ValueError: if only one label is given, both labels are equal, the
            phenotype or scored name is not in the data, a cell lacks the
            scored call, or a scored value is neither zero nor one
    """
    if (positive_label is None) != (negative_label is None):
        raise ValueError("Error if you want to specify labels, give both positive and negative")
    if phenotype not in self.phenotypes:
        raise ValueError("Error phenotype "+str(phenotype)+" is not in the data.")
    if scored_name not in self.scored_names:
        raise ValueError("Error scored_name "+str(scored_name)+" is not in the data.")
    if positive_label is None and negative_label is None:
        positive_label = phenotype+' '+scored_name+'+'
        negative_label = phenotype+' '+scored_name+'-'
    elif positive_label == negative_label:
        raise ValueError("Cant have the same label for positive and negative.")

    def _split_calls(phenotype_calls, scored_calls):
        # Nothing to split for this cell: hand back its calls, not the row
        if phenotype not in phenotype_calls:
            return phenotype_calls
        if scored_name not in scored_calls:
            raise ValueError("Error scored calls are not unified across samples")
        kept = {k: v for k, v in phenotype_calls.items() if k != phenotype}
        scored_value = scored_calls[scored_name]
        if phenotype_calls[phenotype] == 0:
            pos, neg = 0, 0
        elif scored_value == 1:
            pos, neg = 1, 0
        elif scored_value == 0:
            pos, neg = 0, 1
        else:
            raise ValueError("Format error. These values should only ever be zero or one.")
        kept[positive_label] = pos
        kept[negative_label] = neg
        return kept

    def _set_label(calls):
        positives = [k for k, v in calls.items() if v == 1]
        return np.nan if len(positives) == 0 else positives[0]

    data = self.copy()
    data['phenotype_calls'] = self.apply(
        lambda row: _split_calls(row['phenotype_calls'], row['scored_calls']), 1
    )
    data['phenotype_label'] = data.apply(
        lambda row: _set_label(row['phenotype_calls']), 1
    )
    return data.copy()
def collapse_phenotypes(self, input_phenotype_labels, output_phenotype_label, verbose=True):
    """
    Rename one or more input phenotypes to a single output phenotype

    For each cell the input phenotypes it carries are replaced by one
    entry named ``output_phenotype_label`` whose value is the maximum of
    the replaced values. ``phenotype_label`` is then recomputed.

    Args:
        input_phenotype_labels (list): A str name or list of names to combine
        output_phenotype_label (str): A str name to change the phenotype names to
        verbose (bool): output more details (currently unused)

    Returns:
        CellDataFrame: The CellDataFrame modified.

    Raises:
        ValueError: if an input phenotype is not in the data
    """
    if isinstance(input_phenotype_labels, str):
        input_phenotype_labels = [input_phenotype_labels]
    bad_phenotypes = set(input_phenotype_labels)-set(self.phenotypes)
    if len(bad_phenotypes) > 0:
        raise ValueError("Error phenotype(s) "+str(bad_phenotypes)+" are not in the data.")
    data = self.copy()
    if len(input_phenotype_labels) == 0:
        return data

    def _merge_calls(calls):
        # Get the keys we need to merge together
        overlap = set(calls.keys()).intersection(input_phenotype_labels)
        # if there are none to merge we're done already
        if len(overlap) == 0:
            return calls
        merged = {k: v for k, v in calls.items() if k not in input_phenotype_labels}
        merged[output_phenotype_label] = max([calls[name] for name in overlap])
        return merged

    def _first_positive(calls):
        for name, value in calls.items():
            if value == 1:
                return name
        return np.nan

    data['phenotype_calls'] = data.apply(
        lambda row: _merge_calls(row['phenotype_calls']), 1
    )
    data['phenotype_label'] = data.apply(
        lambda row: _first_positive(row['phenotype_calls']), 1
    )
    return data
def rename_phenotype(self, *args, **kwargs):
    """Simple alias for collapse_phenotypes; all arguments are passed through."""
    collapsed = self.collapse_phenotypes(*args, **kwargs)
    return collapsed
def combine_regions(self, input_region_labels, output_region_label, verbose=True):
    """
    Combine/rename one or more input regions to a single output region

    For each cell the input regions listed in its ``regions`` dictionary
    are replaced by one entry named ``output_region_label`` whose value is
    the sum of the replaced values, and a ``region_label`` equal to one of
    the inputs is changed to the output label.

    Args:
        input_region_labels (list): A str name or list of names to combine
        output_region_label (str): A str name to change the region names to
        verbose (bool): output more details (currently unused)

    Returns:
        CellDataFrame: The CellDataFrame modified.

    Raises:
        ValueError: if an input region is not in the data
    """
    if isinstance(input_region_labels, str):
        input_region_labels = [input_region_labels]
    bad_regions = set(input_region_labels)-set(self.regions)
    if len(bad_regions) > 0:
        raise ValueError("Error regions(s) "+str(bad_regions)+" are not in the data.")
    data = self.copy()
    if len(input_region_labels) == 0:
        return data

    def _merge_areas(areas):
        # Get the keys we need to merge together
        overlap = set(areas.keys()).intersection(input_region_labels)
        # if there are none to merge we're done already
        if len(overlap) == 0:
            return areas
        merged = {k: v for k, v in areas.items() if k not in input_region_labels}
        merged[output_region_label] = sum([areas[name] for name in overlap])
        return merged

    data['regions'] = data.apply(lambda row: _merge_areas(row['regions']), 1)
    relabel = data['region_label'].isin(input_region_labels)
    data.loc[relabel, 'region_label'] = output_region_label
    return data
def rename_region(self, *args, **kwargs):
    """Simple alias for combine_regions; all arguments are passed through."""
    combined = self.combine_regions(*args, **kwargs)
    return combined
def fill_phenotype_label(self, inplace=False):
    """
    Set the phenotype_label column according to our rules for mutual exclusion

    Each cell is labelled with the first phenotype in its
    ``phenotype_calls`` whose value is 1, or NaN when there is none.

    Args:
        inplace (bool): modify this frame instead of a copy

    Returns:
        CellDataFrame: the labelled copy when inplace is False. When
        inplace is True nothing is returned, except that an empty frame is
        returned as is.
    """
    def _first_positive(calls):
        for name, value in calls.items():
            if value == 1:
                return name
        return np.nan

    target = self if inplace else self.copy()
    # Nothing to label in an empty frame
    if target.shape[0] == 0:
        return target
    target['phenotype_label'] = target.apply(
        lambda row: _first_positive(row['phenotype_calls']), 1
    )
    return None if inplace else target
def fill_phenotype_calls(self, phenotypes=None, inplace=False):
    """
    Set the phenotype_calls according to the phenotype names

    Every cell gets a dictionary with one 0/1 entry per phenotype, with a
    1 for the phenotype named by its ``phenotype_label``. Cells whose
    label is NaN get all zeros.

    Args:
        phenotypes (list): the phenotype names to include. Defaults to the
            distinct non-missing values of ``phenotype_label``.
        inplace (bool): modify this frame instead of a copy

    Returns:
        CellDataFrame: the updated copy when inplace is False, otherwise
        nothing is returned.
    """
    if phenotypes is None:
        # NaN marks a cell without a phenotype; it must not become a name
        phenotypes = list(self['phenotype_label'].dropna().unique())

    def _get_calls(label):
        calls = {name: 0 for name in phenotypes}
        if label == label:  # False only for the np.nan case
            calls[label] = 1
        return calls

    target = self if inplace else self.copy()
    # Working on the label column alone also handles a frame with no rows
    target['phenotype_calls'] = target['phenotype_label'].apply(_get_calls)
    return None if inplace else target
def phenotypes_to_regions(self, *args, **kwargs):
    """
    Create a new Project where regions are replaced to be based on regions
    defined as phenotypes

    All arguments are passed through to the interface implementation.

    Args:
        path (str): Location to store a new hdf5 file containing a database
            update with new region images
        gaussian_sigma (float): the sigma parameter to the gaussian_filter
            function that says how much to 'blur'
        overwrite (bool): if True allows you to overwrite the path default (False)
        unset_label (str): A label to give regions that are unaccounted for
        project_name (str): the project name

    Returns:
        CellProject: The new cell project
        CellDataFrame: The updated cell project
    """
    converted = interface_phenotypes_to_regions(self, *args, **kwargs)
    return converted
def regions_to_scored(self, regions=[]):
    """
    Convert the region calls to scored_calls

    Each listed region becomes a scored call that is 1 for cells whose
    ``region_label`` equals that region and 0 otherwise.

    Args:
        regions (list): a list of regions to use (default empty list will
            use all regions)

    Returns:
        CellDataFrame: a copy with the extended scored_calls

    Raises:
        ValueError: if regions is not a list, or a region name already
            exists among a cell's scored calls
    """
    if len(regions) == 0:
        regions = self.regions
    if not isinstance(regions, list):
        raise ValueError("ERROR: regions is a list input")

    def _flag_regions(current, region_label):
        flagged = current.copy()
        for region in regions:
            if region in flagged:
                raise ValueError("ERROR: cannot overwrite a scored call.")
            flagged[region] = 1 if region_label == region else 0
        return flagged

    fixed = self.copy()
    fixed['scored_calls'] = fixed.apply(
        lambda row: _flag_regions(row['scored_calls'], row['region_label']), 1
    )
    return fixed
def scored_to_phenotype(self, phenotypes):
    """
    Convert binary phenotypes to mutually exclusive phenotypes.

    If none of the phenotypes are set, then phenotype_label becomes nan.
    If more than one of the phenotypes is set for a cell an error is
    raised.

    Args:
        phenotypes (list): a list of scored_names to convert to phenotypes

    Returns:
        CellDataFrame

    Raises:
        ValueError: if the listed scores are not mutually exclusive
    """
    def _pick_label(scored_calls):
        present = sorted(set(phenotypes) & set(scored_calls.keys()))
        total = sum(scored_calls[name] for name in present)
        if total > 1:
            raise ValueError("You cant extract phenotypes from scores if they are not mutually exclusive")
        if total == 0:
            return np.nan
        for name in present:
            if scored_calls[name] == 1:
                return name
        raise ValueError("Should have hit an exit criteria already")

    def _calls_for(label):
        return dict([(name, 1 if label == name else 0) for name in phenotypes])

    output = self.copy()
    output['phenotype_label'] = output.apply(
        lambda row: _pick_label(row['scored_calls']), 1
    )
    # now update the phenotypes with these
    output['phenotype_calls'] = output.apply(
        lambda row: _calls_for(row['phenotype_label']), 1
    )
    return output
def permute_phenotype_labels(self, phenotype_labels=None,
                             random_state=None,
                             group_strategy=['project_name', 'project_id', 'sample_name', 'sample_id', 'frame_name', 'frame_id']):
    """
    Shuffle phenotype labels. Defaults to shuffling all labels within a
    frame. Adjust this by modifying group_strategy.

    After shuffling, every shuffled cell's ``phenotype_calls`` is rebuilt
    over the full list of phenotypes in this frame, so the calls keep the
    same keys in every group.

    Args:
        phenotype_labels (list): a list of phenotype_labels to shuffle
            amongst each other, if None shuffle all
        random_state (int or numpy random state): pass to the pandas
            shuffle function
        group_strategy (list): variables to group by

    Returns:
        CellDataFrame: rows in the original order with permuted labels
    """
    if random_state is not None and isinstance(random_state, int):
        random_state = np.random.RandomState(random_state)
    remember_order = self.index
    all_phenotypes = self.phenotypes
    # if phenotype_labels is None use all phenotype labels
    if phenotype_labels is None:
        phenotype_labels = all_phenotypes

    is_target = self['phenotype_label'].isin(phenotype_labels)
    keep = self.loc[~is_target].copy()
    toshuffle = self.loc[is_target].copy()

    shuffled = []
    for _, group in toshuffle.groupby(group_strategy):
        sub = group.copy()
        drawn = sub['phenotype_label'].sample(sub.shape[0], random_state=random_state)
        sub['phenotype_label'] = list(drawn.reset_index(drop=True))
        # Use the full phenotype list: a group may not contain every label
        sub = sub.fill_phenotype_calls(phenotypes=all_phenotypes)
        shuffled.append(sub)

    permuted = CellDataFrame.concat([keep]+shuffled).loc[remember_order]
    permuted.db = self.db
    return permuted
def threshold_on_mutually_exclusive_ordinal_labels(self, phenotype_label, ordinal_labels):
    """
    If mutually exclusive ordinal labels are present among the scoring,
    you can threshold a phenotype on these labels.

    The phenotype is replaced by one phenotype per ordinal label, named
    'PHENOTYPE ORDINAL', which is 1 when the cell is positive for both the
    phenotype and that ordinal scored call.

    Args:
        phenotype_label (str): a phenotype_label split based on the ordinal labels
        ordinal_labels (list): the list of ordinal labels to split the
            phenotype label on

    Returns:
        CellDataFrame

    Raises:
        ValueError: if a cell does not have exactly one ordinal label set
    """
    def convert_labels(scored_calls, phenotype_calls):
        fix = {k: v for k, v in phenotype_calls.items() if k != phenotype_label}
        sanity_check = 0
        for ordinal_label in ordinal_labels:
            both = phenotype_calls[phenotype_label] == 1 and scored_calls[ordinal_label] == 1
            fix[phenotype_label+' '+ordinal_label] = 1 if both else 0
            sanity_check += scored_calls[ordinal_label]
        # Exactly one ordinal must be set on every cell
        if sanity_check != 1:
            raise ValueError("ordinal labels not mutually exclusive.")
        return fix

    ndf = self.copy()
    ndf['phenotype_calls'] = ndf.apply(
        lambda row: convert_labels(row['scored_calls'], row['phenotype_calls']), 1
    )
    ndf = ndf.fill_phenotype_label()
    return ndf
def convert_cascading_scores_to_mutually_exclusive_ordinal_binary(self, cascading_scored_calls, ordinal_labels):
    """
    If you have a cascade of scoring stored as binary calls, you can
    convert these to mutually exclusive binary calls for ordinal labels.

    Example: if you have thresholds for 0/1, 1/2, and 2/3, you can convert
    these thresholds to mutually exclusive +/- for 0,1,2,3. The cascading
    calls are removed from ``scored_calls`` and the ordinal labels are
    added in their place.

    Args:
        cascading_scored_calls (list): an ordered from lowest thresholds to
            greatest thresholds list of thresholds in scored_names
        ordinal_labels (list): the list of ordinal labels to split the
            phenotype label into

    Returns:
        CellDataFrame

    Raises:
        ValueError: if there is not exactly one more ordinal label than
            there are cascading thresholds
    """
    if len(ordinal_labels)-1 != len(cascading_scored_calls):
        raise ValueError("You need one more ordinal label than the cascading thresholds")

    def do_conv(calls):
        fix = {k: v for k, v in calls.items() if k not in cascading_scored_calls}
        # Start with every ordinal set, then knock out the ones ruled out
        fix.update((label, 1) for label in ordinal_labels)
        for i, _ in enumerate(cascading_scored_calls):
            # For each cascading score, see if this or a greater threshold
            # is set. If none is, the next ordinal is ruled out; otherwise
            # the current ordinal is.
            passed = sum(calls[name] for name in cascading_scored_calls[i:])
            ruled_out = ordinal_labels[i+1] if passed == 0 else ordinal_labels[i]
            fix[ruled_out] = 0
        return fix

    ndf = self.copy()
    ndf['scored_calls'] = ndf['scored_calls'].apply(do_conv)
    return ndf
def _extract_unique_keys_from_series(s): uni = pd.Series(s.apply(lambda x: json.dumps(x)).unique()).\ apply(lambda x: json.loads(x)).apply(lambda x: set(sorted(x.keys()))) return sorted(list(set().union(*list(uni)))) def _dict_rename(old,change): new_keys = [x if x not in change else change[x] for x in old.keys()] return dict(zip(new_keys, old.values()))