Source code for pythologist.measurements.spatial.nearestneighbors

import pandas as pd
import sys
from pythologist.measurements import Measurement
import numpy as np
from scipy.spatial.distance import cdist
[docs]class NearestNeighbors(Measurement): @staticmethod def _preprocess_dataframe(cdf,*args,**kwargs): #step_pixels = kwargs['step_pixels'] #max_distance_pixels = kwargs['max_distance_pixels'] def _mindist_nodiag(pts1,pts2): mat = cdist(list(pts1),list(pts2)) if len(pts1)==len(pts2) and set(pts1.index) == set(pts2.index): np.fill_diagonal(mat,np.nan) dmat = pd.DataFrame(mat) # remove if they are all nan worst = pd.DataFrame(mat).isna().all(1) dmat.loc[worst]=999999999 mat = np.array(dmat) #print(mat) matmin = np.nanargmin(mat,axis=1).astype(float) matmin[worst] = np.nan #print(matmin) data = [[pts1.index[i],pts1.iloc[i],np.nan,np.nan,np.nan] if np.isnan(y) else (pts1.index[i],pts1.iloc[i],pts2.index[int(y)],pts2.iloc[int(y)],mat[i,int(y)]) \ for i,y in enumerate(matmin)] data = pd.DataFrame(data,columns=['cell_index','cell_coord','neighbor_cell_index','neighbor_cell_coord','minimum_distance_pixels']) return data.dropna() def _combine_dfs(minima,index,index_names): n1 = minima n1['_key'] = 1 n2 = pd.DataFrame(index,index=index_names).T n2['_key'] = 1 return n2.merge(n1,on='_key').drop(columns='_key') cdf = cdf.copy() if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("read phenotype label\n") mr = cdf.get_measured_regions().drop(columns='region_area_pixels') #cdf['phenotype_label'] = cdf.apply(lambda x: # [k for k,v in x['phenotype_calls'].items() if v==1] # ,1).apply(lambda x: np.nan if len(x)==0 else x[0]) cdf['phenotype_label'] = cdf['phenotype_calls'].\ apply(lambda x: dict((v,k) for k, v in x.items())).\ apply(lambda x: np.nan if 1 not in x else x[1]) phenotypes = cdf['phenotype_label'].unique() if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("get all coordinates\n") cdf['coord'] = cdf.apply(lambda x: (x['x'],x['y']),1) if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("get all coord pairs\n") cdf = cdf.groupby(list(mr.columns)+['phenotype_label']).apply(lambda x: pd.Series(dict(zip( ['cell_index','coordinates'], [list(x['cell_index']),list(x['coord'])] ))) ).reset_index() if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("set up comparisons points\n") cdf = cdf.merge(cdf.rename(columns={'cell_index':'neighbor_cell_index', 'coordinates':'neighbor_coordinates', 'phenotype_label':'neighbor_phenotype_label'}), on = list(mr.columns)) if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("get minima\n") cdf = cdf.set_index(list(mr.columns)+['phenotype_label','neighbor_phenotype_label']).\ apply(lambda x: _mindist_nodiag(pd.Series(x['coordinates'],index=x['cell_index']), pd.Series(x['neighbor_coordinates'],index=x['neighbor_cell_index'])) ,1) inames = cdf.index.names cdf = cdf.reset_index().rename(columns={0:'cdist'}).set_index(inames) if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("combine data\n") cdf = cdf.apply(lambda x: _combine_dfs(x['cdist'],x.name,cdf.index.names),1) return pd.concat(cdf.tolist()) def _distance(self,mergeon,minimum_edges): mr = self.measured_regions[mergeon].drop_duplicates().copy() mr['_key'] = 1 mp = pd.DataFrame({'phenotype_label':self.measured_phenotypes}) mp['_key'] = 1 mn = pd.DataFrame({'neighbor_phenotype_label':self.measured_phenotypes}) mn['_key'] = 1 data = mr.merge(mp,on='_key').merge(mn,on='_key').drop(columns='_key') fdata = self.groupby(mergeon+['phenotype_label','neighbor_phenotype_label']).\ apply(lambda x: pd.Series(dict(zip( ['edge_count', 'mean_distance_pixels', 'mean_distance_um', 'stddev_distance_pixels', 'stddev_distance_um', 'stderr_distance_pixels', 'stderr_distance_um' ], [ len(x['minimum_distance_pixels']), x['minimum_distance_pixels'].mean(), x['minimum_distance_pixels'].mean()*self.microns_per_pixel, x['minimum_distance_pixels'].std(), x['minimum_distance_pixels'].std()*self.microns_per_pixel, x['minimum_distance_pixels'].std()/np.sqrt(len(x['minimum_distance_pixels'])), x['minimum_distance_pixels'].std()*self.microns_per_pixel/np.sqrt(len(x['minimum_distance_pixels'])) ] ))) ).reset_index() fdata.loc[fdata['edge_count']<minimum_edges,'mean_distance_pixels'] = np.nan fdata.loc[fdata['edge_count']<minimum_edges,'mean_distance_um'] = np.nan fdata.loc[fdata['edge_count']<minimum_edges,'stddev_distance_pixels'] = np.nan fdata.loc[fdata['edge_count']<minimum_edges,'stddev_distance_um'] = np.nan fdata.loc[fdata['edge_count']<minimum_edges,'stderr_distance_pixels'] = np.nan fdata.loc[fdata['edge_count']<minimum_edges,'stderr_distance_um'] = np.nan data = data.merge(fdata,on=list(data.columns),how='left') data['minimum_edges'] = minimum_edges return data def frame_distance(self,minimum_edges=20): mergeon=['project_id','project_name','sample_id','sample_name','frame_id','frame_name','region_label'] return self._distance(mergeon,minimum_edges) def _cummulative_sample_distance(self,minimum_edges=20): mergeon=['project_id','project_name','sample_id','sample_name','region_label'] data = self._distance(mergeon,minimum_edges).\ rename(columns={'edge_count':'cummulative_edge_count', 'mean_distance_pixels':'mean_cummulative_distance_pixels', 'mean_distance_um':'mean_cummulative_distance_um', 'stddev_distance_pixels':'stddev_cummulative_distance_pixels', 'stddev_distance_um':'stddev_cummulative_distance_um', 'stddev_distance_pixels':'stddev_cummulative_distance_pixels', 'stderr_distance_um':'stddev_cummulative_distance_um', }) return data def _mean_sample_distance(self,minimum_edges=20): mergeon=['project_id','project_name','sample_id','sample_name','region_label'] mr = self.measured_regions[mergeon+['frame_id','frame_name']].drop_duplicates().copy() mr = mr.groupby(mergeon).count()[['frame_id']].rename(columns={'frame_id':'frame_count'}).\ reset_index() mr['_key'] = 1 mp = pd.DataFrame({'phenotype_label':self.measured_phenotypes}) mp['_key'] = 1 mn = pd.DataFrame({'neighbor_phenotype_label':self.measured_phenotypes}) mn['_key'] = 1 blank = mr.merge(mp,on='_key').merge(mn,on='_key').drop(columns='_key') data = self.frame_distance(minimum_edges).dropna() data = data.groupby(mergeon+['phenotype_label','neighbor_phenotype_label']).\ apply(lambda x: pd.Series(dict(zip( ['mean_mean_distance_pixels', 'mean_mean_distance_um', 'stddev_mean_distance_pixels', 'stddev_mean_distance_um', 'stderr_mean_distance_pixels', 'stderr_mean_distance_um', 'measured_frame_count' ], [ x['mean_distance_pixels'].mean(), x['mean_distance_um'].mean(), x['mean_distance_pixels'].std(), x['mean_distance_um'].std(), x['mean_distance_pixels'].std()/np.sqrt(len(x['mean_distance_pixels'])), x['mean_distance_um'].std()/np.sqrt(len(x['mean_distance_pixels'])), len(x['mean_distance_pixels']) ] ))) ).reset_index() data = blank.merge(data,on=mergeon+['phenotype_label','neighbor_phenotype_label'],how='left') return data def sample_distance(self,minimum_edges=20): mergeon=['project_id','project_name','sample_id','sample_name','region_label'] v1 = self._cummulative_sample_distance(minimum_edges) v2 = self._mean_sample_distance(minimum_edges) data = v1.merge(v2,on=mergeon+['phenotype_label','neighbor_phenotype_label']) data.loc[data['measured_frame_count'].isna(),'measured_frame_count'] = 0 return data def frame_proximity(self,threshold_um,phenotype): threshold = threshold_um/self.microns_per_pixel mergeon = self.cdf.frame_columns+['region_label'] df = self.loc[(self['neighbor_phenotype_label']==phenotype) ].copy() df.loc[df['minimum_distance_pixels']>=threshold,'location'] = 'far' df.loc[df['minimum_distance_pixels']<threshold,'location'] = 'near' df = df.groupby(mergeon+['phenotype_label','neighbor_phenotype_label','location']).count()[['cell_index']].\ rename(columns={'cell_index':'count'}).reset_index()[mergeon+['phenotype_label','location','count']] mr = self.measured_regions[mergeon].copy() mr['_key'] = 1 mp = pd.DataFrame({'phenotype_label':self.measured_phenotypes}) mp['_key'] = 1 total = df.groupby(mergeon+['location']).sum()[['count']].rename(columns={'count':'total'}).reset_index() blank = mr.merge(mp,on='_key').merge(total,on=mergeon).drop(columns='_key') df = blank.merge(df,on=mergeon+['location','phenotype_label'],how='left') df.loc[(~df['total'].isna())&(df['count'].isna()),'count'] =0 df['fraction'] = df.apply(lambda x: x['count']/x['total'],1) df = df.sort_values(mergeon+['location','phenotype_label']) return df def sample_proximity(self,threshold_um,phenotype): mergeon = self.cdf.sample_columns+['region_label'] fp = self.frame_proximity(threshold_um,phenotype) cnt = fp.groupby(mergeon+['phenotype_label','location']).sum()[['count']].reset_index() total = cnt.groupby(mergeon+['location']).sum()[['count']].rename(columns={'count':'total'}).\ reset_index() cnt = cnt.merge(total,on=mergeon+['location']).sort_values(mergeon+['location','phenotype_label']) cnt['fraction'] = cnt.apply(lambda x: x['count']/x['total'],1) return cnt def project_proximity(self,threshold_um,phenotype): mergeon = self.cdf.project_columns+['region_label'] fp = self.sample_proximity(threshold_um,phenotype) cnt = fp.groupby(mergeon+['phenotype_label','location']).sum()[['count']].reset_index() total = cnt.groupby(mergeon+['location']).sum()[['count']].rename(columns={'count':'total'}).\ reset_index() cnt = cnt.merge(total,on=mergeon+['location']).sort_values(mergeon+['location','phenotype_label']) cnt['fraction'] = cnt.apply(lambda x: x['count']/x['total'],1) return cnt def threshold(self,phenotype,proximal_label,distance_um=None,distance_pixels=None): def _add_score(d,value,label): d[label] = int(value) return d # for the given phenotype, define whether all cell is within # distance_um of a neighboring cell with that phenotype if distance_um is not None and distance_pixels is None: distance_pixels = distance_um/self.microns_per_pixel nn1 = self.loc[self['neighbor_phenotype_label']==phenotype].copy() nn1['_threshold'] = np.nan nn1.loc[(nn1['minimum_distance_pixels']<distance_pixels),'_threshold'] = 1 nn1.loc[(nn1['minimum_distance_pixels']>=distance_pixels),'_threshold'] = 0 output = self.cdf.copy() mergeon = output.frame_columns+['region_label','cell_index'] cdf = output.merge(nn1[mergeon+['_threshold']],on=mergeon) cdf['scored_calls'] = cdf.apply(lambda x: _add_score(x['scored_calls'],x['_threshold'],proximal_label) ,1) cdf.microns_per_pixel = self.microns_per_pixel return cdf.drop(columns='_threshold') def bin_fractions_from_neighbor(self,neighbor_phenotype,numerator_phenotypes,denominator_phenotypes, bin_size_microns=20, minimum_total_count=0, group_strategy=['project_name','sample_name']): # set our bin size in microns mynn = self.loc[self['neighbor_phenotype_label']==neighbor_phenotype].copy() mynn['minimum_distance_microns'] = mynn['minimum_distance_pixels'].apply(lambda x: x*self.cdf.microns_per_pixel) rngs = np.arange(0,mynn['minimum_distance_microns'].max(),bin_size_microns) mynn['bins'] = pd.cut(mynn['minimum_distance_microns'],bins=rngs) numerator = mynn.loc[mynn['phenotype_label'].isin(numerator_phenotypes)] denominator = mynn.loc[mynn['phenotype_label'].isin(denominator_phenotypes)] numerator = numerator.groupby(group_strategy+['bins']).count()[['cell_index']].rename(columns={'cell_index':'cell_count'}).reset_index() numerator['group'] = 'numerator' denominator = denominator.groupby(group_strategy+['bins']).count()[['cell_index']].rename(columns={'cell_index':'cell_count'}).reset_index() denominator['group'] = 'total' sub = pd.concat([numerator,denominator]) sub = sub.set_index(group_strategy+['bins']).pivot(columns='group') sub.columns = sub.columns.droplevel(0) sub = sub.reset_index() sub['fraction'] = sub['numerator'].divide(sub['total']) sub.loc[sub['numerator'].isna(),'numerator']=0 sub.loc[sub['total'].isna(),'total']=0 sub['right']=[int(x.right) for x in sub['bins'].tolist()] sub.loc[sub['total']<minimum_total_count,'fraction']=np.nan return sub