# Source code for pythologist.measurements.spatial.nearestneighbors

import pandas as pd
import sys
from pythologist.measurements import Measurement
import numpy as np
from scipy.spatial.distance import cdist
class NearestNeighbors(Measurement):
    @staticmethod
    def _preprocess_dataframe(cdf,*args,**kwargs):
        """Build the per-cell nearest-neighbor table from a cell dataframe.

        For every measured region, and for every ordered pair of
        (phenotype, neighbor phenotype) found in that region, each cell of the
        first phenotype is matched to its closest cell of the neighbor
        phenotype.

        Args:
            cdf: cell dataframe. The code reads its ``phenotype_calls``,
                ``x``, ``y`` and ``cell_index`` columns and calls
                ``cdf.get_measured_regions()``.
            *args: accepted but not used in this function.
            **kwargs: only ``verbose`` is read; when truthy, progress
                messages are written to stderr.

        Returns:
            The concatenation of one frame per (region, phenotype, neighbor
            phenotype) combination. Columns are the measured-region columns
            (minus ``region_area_pixels``), ``phenotype_label``,
            ``neighbor_phenotype_label``, ``cell_index``, ``cell_coord``,
            ``neighbor_cell_index``, ``neighbor_cell_coord`` and
            ``minimum_distance_pixels``.
        """
        #step_pixels = kwargs['step_pixels']
        #max_distance_pixels = kwargs['max_distance_pixels']
        def _mindist_nodiag(pts1,pts2):
            """Return, for each point in pts1, its closest point in pts2.

            pts1 and pts2 are Series of coordinate tuples indexed by cell
            index. Rows for which no neighbor could be found are dropped.
            """
            # Full pairwise distance matrix: rows follow pts1, columns follow pts2.
            mat = cdist(list(pts1),list(pts2))
            # When both series hold the same set of cells, blank the diagonal so
            # a cell is not reported as its own nearest neighbor.
            # NOTE(review): only the index *sets* are compared; this presumes both
            # series list the cells in the same order -- confirm against caller.
            if len(pts1)==len(pts2) and set(pts1.index) == set(pts2.index): 
                np.fill_diagonal(mat,np.nan)
            dmat = pd.DataFrame(mat)
            # remove if they are all nan
            # (e.g. a lone cell compared against itself). These rows get a
            # placeholder value so np.nanargmin has something to select; they
            # are flagged back to NaN right after and dropped at the end.
            worst = pd.DataFrame(mat).isna().all(1)
            dmat.loc[worst]=999999999
            mat = np.array(dmat)
            #print(mat)
            # Column position of the minimum distance in each row; float so that
            # the flagged rows can carry NaN.
            matmin = np.nanargmin(mat,axis=1).astype(float)
            matmin[worst] = np.nan
            #print(matmin)
            # One record per pts1 cell: its index and coordinate, then the
            # neighbor's index, coordinate and distance (NaN when flagged).
            data = [[pts1.index[i],pts1.iloc[i],np.nan,np.nan,np.nan] if np.isnan(y) else
                    (pts1.index[i],pts1.iloc[i],pts2.index[int(y)],pts2.iloc[int(y)],mat[i,int(y)]) \
                        for i,y in enumerate(matmin)]
            data = pd.DataFrame(data,columns=['cell_index','cell_coord','neighbor_cell_index','neighbor_cell_coord','minimum_distance_pixels'])
            return data.dropna()
        def _combine_dfs(minima,index,index_names):
            """Attach the values in ``index`` (named by ``index_names``) as
            columns to every row of ``minima``.

            Implemented as a cross join on a constant ``_key`` column. ``n1`` is
            the same object as ``minima``, so the ``_key`` column is also added
            to the frame that was passed in; only the returned frame has it
            dropped.
            """
            n1 = minima
            n1['_key'] = 1
            # Single-row frame holding the index values under their names.
            n2 = pd.DataFrame(index,index=index_names).T
            n2['_key'] = 1
            return n2.merge(n1,on='_key').drop(columns='_key')
        # Work on a copy so the columns added below do not touch the caller's frame.
        cdf = cdf.copy()
        if 'verbose'  in kwargs and kwargs['verbose']: sys.stderr.write("read phenotype label\n")
        # Region-identifying columns; they serve as the grouping and merge keys below.
        mr = cdf.get_measured_regions().drop(columns='region_area_pixels')
        #cdf['phenotype_label'] = cdf.apply(lambda x: 
        #        [k for k,v in x['phenotype_calls'].items() if v==1]
        #    ,1).apply(lambda x: np.nan if len(x)==0 else x[0])
        # Invert each phenotype_calls dict (value -> name) and take the name stored
        # under 1; NaN when nothing is called. If several names map to 1, the one
        # iterated last is the one kept by the dict constructor.
        cdf['phenotype_label'] = cdf['phenotype_calls'].\
            apply(lambda x: dict((v,k) for k, v in x.items())).\
            apply(lambda x: np.nan if 1 not in x else x[1])
        # NOTE: `phenotypes` is not referenced again in this function.
        phenotypes = cdf['phenotype_label'].unique()
        if 'verbose'  in kwargs and kwargs['verbose']: sys.stderr.write("get all coordinates\n")
        # (x, y) tuple per cell, the point format handed to cdist.
        cdf['coord'] = cdf.apply(lambda x: (x['x'],x['y']),1)
        if 'verbose'  in kwargs and kwargs['verbose']: sys.stderr.write("get all coord pairs\n")
        # Collapse to one row per (region, phenotype) holding parallel lists of
        # cell indices and coordinates.
        cdf = cdf.groupby(list(mr.columns)+['phenotype_label']).apply(lambda x: 
            pd.Series(dict(zip(
                ['cell_index','coordinates'],
                [list(x['cell_index']),list(x['coord'])]            
            )))
        ).reset_index()
        if 'verbose'  in kwargs and kwargs['verbose']: sys.stderr.write("set up comparisons points\n")
        # Self-merge on the region columns: every phenotype in a region is paired
        # with every phenotype in that same region (itself included).
        cdf = cdf.merge(cdf.rename(columns={'cell_index':'neighbor_cell_index',
                                            'coordinates':'neighbor_coordinates',
                                            'phenotype_label':'neighbor_phenotype_label'}),
                       on = list(mr.columns))
        if 'verbose'  in kwargs and kwargs['verbose']: sys.stderr.write("get minima\n")
        # Row-wise: compute the nearest-neighbor frame for each pairing.
        cdf = cdf.set_index(list(mr.columns)+['phenotype_label','neighbor_phenotype_label']).\
            apply(lambda x: 
                    _mindist_nodiag(pd.Series(x['coordinates'],index=x['cell_index']),
                                    pd.Series(x['neighbor_coordinates'],index=x['neighbor_cell_index']))
            ,1)
        inames = cdf.index.names
        # Name the column of per-pairing result frames 'cdist' and restore the index.
        cdf  = cdf.reset_index().rename(columns={0:'cdist'}).set_index(inames)
        if 'verbose'  in kwargs and kwargs['verbose']: sys.stderr.write("combine data\n")
        # Stamp each per-pairing frame with its region/phenotype labels (x.name is
        # the row's index tuple), then stack all of them into one frame.
        cdf = cdf.apply(lambda x: _combine_dfs(x['cdist'],x.name,cdf.index.names),1)
        return pd.concat(cdf.tolist())    
    def _distance(self,mergeon,minimum_edges):
        """Summarize nearest-neighbor distances per group of ``mergeon`` columns.

        Args:
            mergeon: list of column names that define one reporting unit.
            minimum_edges: groups with fewer edges than this keep their
                ``edge_count`` but have all distance statistics set to NaN.

        Returns:
            One row for every combination of distinct ``mergeon`` values in
            ``self.measured_regions``, phenotype and neighbor phenotype
            (both taken from ``self.measured_phenotypes``). Each row carries
            ``edge_count`` and the mean / stddev / stderr of
            ``minimum_distance_pixels`` in pixels and in microns, plus a
            ``minimum_edges`` column. Combinations with no data are left
            as NaN by the left merge.
        """
        stat_names = ['mean_distance_pixels',
                      'mean_distance_um',
                      'stddev_distance_pixels',
                      'stddev_distance_um',
                      'stderr_distance_pixels',
                      'stderr_distance_um']

        def _summarize(group):
            # Summary statistics for the edges of one group.
            distances = group['minimum_distance_pixels']
            edge_count = len(distances)
            mean_pixels = distances.mean()
            stddev_pixels = distances.std()
            stddev_um = stddev_pixels*self.microns_per_pixel
            values = [edge_count,
                      mean_pixels,
                      mean_pixels*self.microns_per_pixel,
                      stddev_pixels,
                      stddev_um,
                      stddev_pixels/np.sqrt(edge_count),
                      stddev_um/np.sqrt(edge_count)]
            return pd.Series(dict(zip(['edge_count']+stat_names,values)))

        # Scaffold: every region x phenotype x neighbor phenotype, built as a
        # cross join on a constant key.
        regions = self.measured_regions[mergeon].drop_duplicates().copy()
        regions['_key'] = 1
        cells = pd.DataFrame({'phenotype_label':self.measured_phenotypes})
        cells['_key'] = 1
        neighbors = pd.DataFrame({'neighbor_phenotype_label':self.measured_phenotypes})
        neighbors['_key'] = 1
        scaffold = regions.merge(cells,on='_key')
        scaffold = scaffold.merge(neighbors,on='_key').drop(columns='_key')

        # Observed statistics for the combinations that actually have edges.
        grouped = self.groupby(mergeon+['phenotype_label','neighbor_phenotype_label'])
        summary = grouped.apply(_summarize).reset_index()

        # Blank the statistics of under-sampled groups.
        too_few = summary['edge_count']<minimum_edges
        for name in stat_names:
            summary.loc[too_few,name] = np.nan

        result = scaffold.merge(summary,on=list(scaffold.columns),how='left')
        result['minimum_edges'] = minimum_edges
        return result
    def frame_distance(self,minimum_edges=20):
        """Return the distance summary computed per frame and region.

        Delegates to ``_distance`` using the project, sample, frame and
        region columns as the grouping keys.

        Args:
            minimum_edges: passed through to ``_distance`` (default 20).
        """
        frame_keys = ('project_id','project_name',
                      'sample_id','sample_name',
                      'frame_id','frame_name',
                      'region_label')
        return self._distance(list(frame_keys),minimum_edges)
    def _cummulative_sample_distance(self,minimum_edges=20):
        """Return the distance summary pooled over all frames of a sample.

        Calls ``_distance`` grouped by project, sample and region, then
        renames the count and every statistic column to its
        ``cummulative`` counterpart.

        Args:
            minimum_edges: passed through to ``_distance`` (default 20).

        Returns:
            The frame from ``_distance`` with ``edge_count`` renamed to
            ``cummulative_edge_count`` and each ``<stat>_distance_<unit>``
            column renamed to ``<stat>_cummulative_distance_<unit>`` for
            stat in mean / stddev / stderr and unit in pixels / um.
        """
        mergeon=['project_id','project_name','sample_id','sample_name','region_label']
        # Each source column gets its own distinct target name. The earlier
        # mapping repeated the stddev_distance_pixels key and sent
        # stderr_distance_um to the stddev name, which produced a duplicate
        # column label and left stderr_distance_pixels unrenamed.
        renames = {'edge_count':'cummulative_edge_count',
                   'mean_distance_pixels':'mean_cummulative_distance_pixels',
                   'mean_distance_um':'mean_cummulative_distance_um',
                   'stddev_distance_pixels':'stddev_cummulative_distance_pixels',
                   'stddev_distance_um':'stddev_cummulative_distance_um',
                   'stderr_distance_pixels':'stderr_cummulative_distance_pixels',
                   'stderr_distance_um':'stderr_cummulative_distance_um',
                  }
        data = self._distance(mergeon,minimum_edges).rename(columns=renames)
        return data
    def _mean_sample_distance(self,minimum_edges=20):
        """Average the per-frame mean distances within each sample and region.

        Frame-level results come from ``frame_distance``; rows containing
        NaN (for example frames below ``minimum_edges``) are dropped before
        averaging.

        Args:
            minimum_edges: passed through to ``frame_distance`` (default 20).

        Returns:
            One row per sample/region x phenotype x neighbor phenotype with
            ``frame_count`` (frames listed in ``self.measured_regions``), the
            mean / stddev / stderr of the frame means in pixels and microns,
            and ``measured_frame_count`` (frames that contributed).
            Combinations with no contributing frame are left as NaN by the
            left merge.
        """
        mergeon=['project_id','project_name','sample_id','sample_name','region_label']
        pair_keys = mergeon+['phenotype_label','neighbor_phenotype_label']
        stat_names = ['mean_mean_distance_pixels',
                      'mean_mean_distance_um',
                      'stddev_mean_distance_pixels',
                      'stddev_mean_distance_um',
                      'stderr_mean_distance_pixels',
                      'stderr_mean_distance_um',
                      'measured_frame_count']

        def _summarize(group):
            # Statistics over the frame-level means of one group.
            pixels = group['mean_distance_pixels']
            microns = group['mean_distance_um']
            frames = len(pixels)
            values = [pixels.mean(),
                      microns.mean(),
                      pixels.std(),
                      microns.std(),
                      pixels.std()/np.sqrt(frames),
                      microns.std()/np.sqrt(frames),
                      frames]
            return pd.Series(dict(zip(stat_names,values)))

        # Number of frames recorded for each sample/region.
        frame_table = self.measured_regions[mergeon+['frame_id','frame_name']].drop_duplicates().copy()
        frame_counts = frame_table.groupby(mergeon).count()[['frame_id']]
        frame_counts = frame_counts.rename(columns={'frame_id':'frame_count'}).reset_index()
        frame_counts['_key'] = 1

        # Scaffold of every sample/region x phenotype x neighbor phenotype.
        cells = pd.DataFrame({'phenotype_label':self.measured_phenotypes})
        cells['_key'] = 1
        neighbors = pd.DataFrame({'neighbor_phenotype_label':self.measured_phenotypes})
        neighbors['_key'] = 1
        scaffold = frame_counts.merge(cells,on='_key')
        scaffold = scaffold.merge(neighbors,on='_key').drop(columns='_key')

        per_frame = self.frame_distance(minimum_edges).dropna()
        summary = per_frame.groupby(pair_keys).apply(_summarize).reset_index()
        return scaffold.merge(summary,on=pair_keys,how='left')
    def sample_distance(self,minimum_edges=20):
        """Return sample-level distance statistics, pooled and frame-averaged.

        Merges the output of ``_cummulative_sample_distance`` with that of
        ``_mean_sample_distance`` on the sample, region and phenotype-pair
        columns, and reports a ``measured_frame_count`` of 0 where no frame
        contributed.

        Args:
            minimum_edges: passed through to both helpers (default 20).
        """
        join_keys = ['project_id','project_name','sample_id','sample_name','region_label',
                     'phenotype_label','neighbor_phenotype_label']
        pooled = self._cummulative_sample_distance(minimum_edges)
        averaged = self._mean_sample_distance(minimum_edges)
        combined = pooled.merge(averaged,on=join_keys)
        # A missing count means no frame was measured for that combination.
        unmeasured = combined['measured_frame_count'].isna()
        combined.loc[unmeasured,'measured_frame_count'] = 0
        return combined

    def frame_proximity(self,threshold_um,phenotype):
        """Count cells near to / far from ``phenotype`` for each frame and region.

        A cell is 'near' when its nearest neighbor of ``phenotype`` is closer
        than ``threshold_um`` (converted to pixels with
        ``self.microns_per_pixel``) and 'far' when it is at or beyond it.

        Args:
            threshold_um: distance cut-off in microns.
            phenotype: neighbor phenotype label the distances refer to.

        Returns:
            One row per frame/region, phenotype and location with ``total``
            (all cells at that location), ``count`` (cells of that phenotype,
            0 when none were seen) and ``fraction`` (count / total), sorted by
            the frame/region columns, location and phenotype.
        """
        cutoff_pixels = threshold_um/self.microns_per_pixel
        mergeon = self.cdf.frame_columns+['region_label']

        # Keep only edges that point at the requested neighbor phenotype.
        is_target = self['neighbor_phenotype_label']==phenotype
        edges = self.loc[is_target].copy()
        is_far = edges['minimum_distance_pixels']>=cutoff_pixels
        edges.loc[is_far,'location'] = 'far'
        is_near = edges['minimum_distance_pixels']<cutoff_pixels
        edges.loc[is_near,'location'] = 'near'

        # Cells per frame/region, phenotype and location.
        tallied = edges.groupby(mergeon+['phenotype_label','neighbor_phenotype_label','location']).count()
        tallied = tallied[['cell_index']].rename(columns={'cell_index':'count'}).reset_index()
        counts = tallied[mergeon+['phenotype_label','location','count']]

        regions = self.measured_regions[mergeon].copy()
        regions['_key'] = 1
        labels = pd.DataFrame({'phenotype_label':self.measured_phenotypes})
        labels['_key'] = 1

        # Cells per frame/region and location, across all phenotypes.
        totals = counts.groupby(mergeon+['location']).sum()[['count']]
        totals = totals.rename(columns={'count':'total'}).reset_index()

        # Every region x phenotype, restricted to locations that have a total.
        scaffold = regions.merge(labels,on='_key').merge(totals,on=mergeon).drop(columns='_key')
        result = scaffold.merge(counts,on=mergeon+['location','phenotype_label'],how='left')

        # A phenotype missing from a populated location has a count of zero.
        has_total = ~result['total'].isna()
        no_count = result['count'].isna()
        result.loc[has_total&no_count,'count'] =0

        def _ratio(row):
            return row['count']/row['total']
        result['fraction'] = result.apply(_ratio,1)
        result = result.sort_values(mergeon+['location','phenotype_label'])
        return result
    def sample_proximity(self,threshold_um,phenotype):
        """Aggregate ``frame_proximity`` counts up to the sample level.

        Args:
            threshold_um: distance cut-off in microns, passed to
                ``frame_proximity``.
            phenotype: neighbor phenotype label, passed to
                ``frame_proximity``.

        Returns:
            One row per sample/region, phenotype and location with the summed
            ``count``, the ``total`` over all phenotypes at that location and
            ``fraction`` (count / total), sorted by the sample/region columns,
            location and phenotype.
        """
        mergeon = self.cdf.sample_columns+['region_label']
        location_keys = mergeon+['location']
        per_frame = self.frame_proximity(threshold_um,phenotype)

        # Sum the frame-level counts within each sample/region.
        summed = per_frame.groupby(mergeon+['phenotype_label','location']).sum()
        counts = summed[['count']].reset_index()

        # Denominator: all phenotypes together at each location.
        totals = counts.groupby(location_keys).sum()[['count']]
        totals = totals.rename(columns={'count':'total'}).reset_index()

        merged = counts.merge(totals,on=location_keys)
        merged = merged.sort_values(mergeon+['location','phenotype_label'])

        def _ratio(row):
            return row['count']/row['total']
        merged['fraction'] = merged.apply(_ratio,1)
        return merged
    def project_proximity(self,threshold_um,phenotype):
        """Aggregate ``sample_proximity`` counts up to the project level.

        Args:
            threshold_um: distance cut-off in microns, passed to
                ``sample_proximity``.
            phenotype: neighbor phenotype label, passed to
                ``sample_proximity``.

        Returns:
            One row per project/region, phenotype and location with the
            summed ``count``, the ``total`` over all phenotypes at that
            location and ``fraction`` (count / total), sorted by the
            project/region columns, location and phenotype.
        """
        mergeon = self.cdf.project_columns+['region_label']
        location_keys = mergeon+['location']
        per_sample = self.sample_proximity(threshold_um,phenotype)

        # Sum the sample-level counts within each project/region.
        summed = per_sample.groupby(mergeon+['phenotype_label','location']).sum()
        counts = summed[['count']].reset_index()

        # Denominator: all phenotypes together at each location.
        totals = counts.groupby(location_keys).sum()[['count']]
        totals = totals.rename(columns={'count':'total'}).reset_index()

        merged = counts.merge(totals,on=location_keys)
        merged = merged.sort_values(mergeon+['location','phenotype_label'])

        def _ratio(row):
            return row['count']/row['total']
        merged['fraction'] = merged.apply(_ratio,1)
        return merged
    def threshold(self,phenotype,proximal_label,distance_um=None,distance_pixels=None):
        """Add a binary proximity score to each cell's ``scored_calls``.

        For the given phenotype, each cell is scored on whether its nearest
        neighbor of that phenotype lies within the given distance.

        Args:
            phenotype: neighbor phenotype label to measure proximity to.
            proximal_label: key added to every cell's ``scored_calls``;
                1 when the nearest ``phenotype`` neighbor is closer than the
                cut-off, 0 when it is at or beyond it.
            distance_um: cut-off in microns; converted with
                ``self.microns_per_pixel`` when ``distance_pixels`` is not
                given.
            distance_pixels: cut-off in pixels; takes precedence when both
                are supplied. One of the two distances must be provided.

        Returns:
            A copy of ``self.cdf`` merged with the scores, restricted to cells
            that have a nearest-neighbor row for ``phenotype`` (the merge is
            an inner join), with the new key present in ``scored_calls``.
        """
        def _add_score(d,value,label):
            # DataFrame.copy()/merge do not copy the Python objects stored in
            # cells, so ``d`` is the very dict held by self.cdf. Score a copy
            # so the source dataframe is left untouched.
            d = d.copy()
            d[label] = int(value)
            return d
        if distance_um is not None and distance_pixels is None:
            distance_pixels = distance_um/self.microns_per_pixel
        # Edges that point at the requested neighbor phenotype.
        nn1 = self.loc[self['neighbor_phenotype_label']==phenotype].copy()
        nn1['_threshold'] = np.nan
        nn1.loc[(nn1['minimum_distance_pixels']<distance_pixels),'_threshold'] = 1
        nn1.loc[(nn1['minimum_distance_pixels']>=distance_pixels),'_threshold'] = 0
        output = self.cdf.copy()
        mergeon = output.frame_columns+['region_label','cell_index']
        cdf = output.merge(nn1[mergeon+['_threshold']],on=mergeon)
        cdf['scored_calls'] = cdf.apply(lambda x:
            _add_score(x['scored_calls'],x['_threshold'],proximal_label)
        ,1)
        cdf.microns_per_pixel = self.microns_per_pixel
        return cdf.drop(columns='_threshold')
    def bin_fractions_from_neighbor(self,neighbor_phenotype,numerator_phenotypes,denominator_phenotypes,
                                         bin_size_microns=20,
                                         minimum_total_count=0,
                                         group_strategy=None):
        """Fraction of cells per distance bin from the nearest ``neighbor_phenotype``.

        Cells are binned by the distance (in microns) to their nearest
        neighbor of ``neighbor_phenotype``. Within each group and bin, the
        number of cells whose phenotype is in ``numerator_phenotypes`` is
        divided by the number whose phenotype is in
        ``denominator_phenotypes``.

        Args:
            neighbor_phenotype: neighbor phenotype label distances refer to.
            numerator_phenotypes: phenotype labels counted in the numerator.
            denominator_phenotypes: phenotype labels counted in the total.
            bin_size_microns: bin width in microns (default 20).
            minimum_total_count: bins whose total is below this value get a
                NaN ``fraction`` (default 0).
            group_strategy: columns to group by in addition to the bin;
                defaults to ``['project_name','sample_name']``.

        Returns:
            One row per group and bin with ``numerator``, ``total``,
            ``fraction`` and ``right`` (the bin's right edge as an int).
            Bins are right-closed, as produced by ``pd.cut``'s defaults.
        """
        # None stands in for the default so no list object is shared between calls.
        if group_strategy is None:
            group_strategy = ['project_name','sample_name']
        else:
            group_strategy = list(group_strategy)
        mynn = self.loc[self['neighbor_phenotype_label']==neighbor_phenotype].copy()
        mynn['minimum_distance_microns'] = mynn['minimum_distance_pixels']*self.cdf.microns_per_pixel
        # Extend the edges one step past the largest distance. np.arange
        # excludes its stop value, so stopping at the maximum left the
        # farthest cells without a bin (they were silently dropped), and left
        # no bin at all when every distance was below bin_size_microns.
        max_distance = mynn['minimum_distance_microns'].max()
        rngs = np.arange(0,max_distance+bin_size_microns,bin_size_microns)
        mynn['bins'] = pd.cut(mynn['minimum_distance_microns'],bins=rngs)
        numerator = mynn.loc[mynn['phenotype_label'].isin(numerator_phenotypes)]
        denominator = mynn.loc[mynn['phenotype_label'].isin(denominator_phenotypes)]

        # Cell counts per group and bin for each side of the fraction.
        numerator = numerator.groupby(group_strategy+['bins']).count()[['cell_index']].rename(columns={'cell_index':'cell_count'}).reset_index()
        numerator['group'] = 'numerator'
        denominator = denominator.groupby(group_strategy+['bins']).count()[['cell_index']].rename(columns={'cell_index':'cell_count'}).reset_index()
        denominator['group'] = 'total'
        # Pivot the stacked counts into side-by-side numerator / total columns.
        sub = pd.concat([numerator,denominator])
        sub = sub.set_index(group_strategy+['bins']).pivot(columns='group')
        sub.columns = sub.columns.droplevel(0)
        sub = sub.reset_index()
        # The fraction is taken before missing counts are zero-filled, so a
        # bin missing either side keeps a NaN fraction.
        sub['fraction'] = sub['numerator'].divide(sub['total'])
        sub.loc[sub['numerator'].isna(),'numerator']=0
        sub.loc[sub['total'].isna(),'total']=0
        sub['right']=[int(x.right) for x in sub['bins'].tolist()]
        sub.loc[sub['total']<minimum_total_count,'fraction']=np.nan
        return sub