# Source code for goodneighbors

import json

import h5py
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

class GoodNeighbors(object):
    """
    Class for executing and storing the neighborhood analysis.

    Properties:
        :microns_per_pixel: (float) set in initialization
        :radius: (float) pixel size of radius, set by set_neighborhood_fractions
                 via the radius_um (in microns)
        :groupby: (list) of cell fields which as a group identify image frames
        :h5path: (str) location of the h5 file storing everything
        :location: (str) subdirectory in the h5 object
    """

    def __init__(self, h5path, location='', mode='r', microns_per_pixel=np.nan):
        """
        Args:
            h5path: path of the backing HDF5 file
            location: optional subgroup inside the file
            mode: 'r' for read-only; 'w'/'r+'/'a' allow writing (the file is
                  initialized immediately in those modes)
            microns_per_pixel: image scale, stored in the file metadata
        """
        self.h5path = h5path
        self.mode = mode
        self.location = location
        self._distance = None
        # FIX: the _cells cache was never initialized, so the `cells` property
        # raised AttributeError before the first set_cells call.
        self._cells = None
        if mode in ['w', 'r+', 'a']:
            with h5py.File(self.h5path, mode) as f:
                if location != '':
                    f.create_group(location + '/cells')
                # FIX: the meta dataset was always created at the file root
                # ('/meta') even when a location was given, while every
                # property reads it from location+'/meta'; create it where it
                # is read (identical path when location is '').
                dset = f.create_dataset(location + '/meta', (100,),
                                        dtype=h5py.special_dtype(vlen=str))
                dset.attrs['microns_per_pixel'] = microns_per_pixel
                dset.attrs['groupby'] = json.dumps([])
                dset.attrs['radius'] = np.nan

    def set_TSNE(self, sample=5000, *args, **kwargs):
        """
        Embed a sample of the neighborhood-fraction matrix with t-SNE and
        attach the stored k-means cluster assignments.

        Args:
            sample: number of cells to sample for the embedding

        Returns:
            pandas.DataFrame with x/y embedding coordinates plus cluster info
        """
        if self.mode not in ['w', 'r+', 'a']:
            raise ValueError('cant write for readonly')
        dsdata = self.neighborhood_fraction_matrix().sample(n=sample)
        embedding = TSNE(n_components=2, perplexity=6,
                         n_iter=3000).fit_transform(dsdata)
        embedding = pd.DataFrame(embedding, columns=['x', 'y'])
        embedding.index = dsdata.index
        embedding = embedding.reset_index().merge(
            self.kmeans_cluster[['cluster_id', 'k', 'cell_index'] + self.groupby],
            on=self.groupby + ['cell_index'])
        return embedding

    @property
    def kmeans_cluster(self):
        # (pandas.DataFrame) cluster assignments saved by set_kmeans_cluster
        return pd.read_hdf(self.h5path, self.location + '/cells/clusters')

    def set_kmeans_cluster(self, k):
        """
        Cluster the neighborhood fractions into k groups and persist the
        assignments at <location>/cells/clusters.
        """
        if self.mode not in ['w', 'r+', 'a']:
            raise ValueError('cant write for readonly')
        km, clusters = self._cluster_kmeans(k)
        clusters['k'] = k
        clusters = clusters.merge(self.cells.drop(columns='phenotype_label'),
                                  on=self.groupby + ['cell_index'])
        clusters.to_hdf(self.h5path, self.location + '/cells/clusters',
                        mode='r+', format='table', complib='zlib', complevel=9)

    def _cluster_kmeans(self, k):
        """Run KMeans on the fraction matrix; return (fitted model, assignments)."""
        fmat = self.neighborhood_fraction_matrix()
        km = KMeans(n_clusters=k).fit(fmat)
        index_labels = list(set(self.neighborhood_fractions.columns) -
                            set(['n_phenotype_label', 'fraction']))
        clusters = fmat.reset_index()[index_labels]
        clusters['cluster_id'] = km.labels_
        clusters.index.name = 'db_id'
        clusters.columns = clusters.columns.droplevel(1)
        return km, clusters

    def test_kmeans(self, kmin=2, kmax=30):
        """
        Sweep k and report, for each k, the model inertia and the fraction of
        the population accounted for by the smallest cluster (to help pick k).

        FIX: kmin/kmax were accepted but ignored (the loop was hard-coded to
        range(2, 12)); they are honored now, kmax inclusive.

        Returns:
            pandas.DataFrame with columns k / inertia / smallest_cluster_fraction
        """
        kdata = []
        for k in range(kmin, kmax + 1):
            km, clusters = self._cluster_kmeans(k)
            # how much of the population the smallest cluster accounts for
            accounted = clusters.groupby('cluster_id')[['cell_index']].count().\
                rename(columns={'cell_index': 'count'}).reset_index()
            smallest = accounted.apply(
                lambda x: x['count'] / clusters.shape[0], 1).min()
            kdata.append([k, km.inertia_, smallest])
        return pd.DataFrame(
            kdata, columns=['k', 'inertia', 'smallest_cluster_fraction'])

    def neighborhood_fraction_matrix(self):
        """Pivot the stored fractions into a cells-by-phenotype matrix."""
        index_labels = list(set(self.neighborhood_fractions.columns) -
                            set(['n_phenotype_label', 'fraction']))
        return self.neighborhood_fractions.set_index(index_labels).\
            pivot(columns='n_phenotype_label')

    @property
    def neighborhood_fractions(self):
        # (pandas.DataFrame) per-cell neighborhood phenotype fractions
        return pd.read_hdf(self.h5path, self.location + '/cells/fractions')

    def set_neighborhood_fractions(self, radius_um):
        """
        Measure, for every cell, the fraction of its neighbors within
        radius_um microns that carry each phenotype, and persist the result
        at <location>/cells/fractions.
        """
        self.radius = radius_um / self.microns_per_pixel
        # FIX: the pixel radius was referenced through an undefined local name
        # `radius` (NameError); bind the converted value explicitly.
        radius = radius_um / self.microns_per_pixel
        dist = self.measure_distance()
        focus = dist.loc[dist['distance'] < radius]
        totals = focus.groupby(self.groupby + ['phenotype_label', 'cell_index']).\
            count()[['db_id']].rename(columns={'db_id': 'total'}).reset_index()
        indiv = focus.groupby(self.groupby + ['phenotype_label', 'cell_index',
                                              'n_phenotype_label']).\
            count()[['db_id']].rename(columns={'db_id': 'count'}).reset_index()
        fracs = totals.merge(indiv,
                             on=self.groupby + ['phenotype_label', 'cell_index'])
        fracs['fraction'] = fracs.apply(lambda x: x['count'] / x['total'], 1)
        fracs = fracs.set_index(self.groupby +
                                ['phenotype_label', 'cell_index', 'total'])\
            [['n_phenotype_label', 'fraction']].\
            pivot(columns='n_phenotype_label')
        fracs.columns = fracs.columns.droplevel(0)
        fracs = fracs.fillna(0).stack().reset_index().\
            rename(columns={0: 'fraction'})
        fracs.to_hdf(self.h5path, self.location + '/cells/fractions',
                     mode='r+', format='table', complib='zlib', complevel=9)
        # NOTE(review): the original assigned fracs['radius'] *after* saving,
        # which had no effect on the stored table; the radius is persisted in
        # the /meta attributes instead, so the dead assignment was dropped.
        return

    def plot_counts_per_radius(self, min_radius=0, max_radius=150, step_radius=2):
        """
        Tabulate neighbor-count quantiles as a function of radius.

        FIX: min_radius/max_radius were accepted but ignored (the loop started
        at 1 and capped at a hard-coded 150); both are honored now. Unused
        locals (mpp, commented radius) were removed.

        Returns:
            pandas.DataFrame with a 'distance' column plus one column per
            quantile (0.05, 0.25, 0.5, 0.75, 0.95) of neighbors-per-cell.
        """
        dist = self.measure_distance()
        census = []
        upper = min(max_radius, int(dist['distance'].max()))
        for i in range(max(1, min_radius), upper, step_radius):
            one = dist.loc[dist['distance'] < i].\
                groupby(['db_id']).count()[['n_db_id']].\
                rename(columns={'n_db_id': 'count'}).reset_index()[['count']].\
                quantile([0.05, 0.25, 0.5, 0.75, 0.95], 0)
            one.index.name = 'quantile'
            one = one.reset_index()
            one['distance'] = i
            one = one.fillna(0)
            census.append(one)
        census = pd.concat(census).pivot(columns='quantile', index='distance',
                                         values='count')
        census.columns = [str(x) for x in census.columns]
        return census.reset_index()

    def measure_distance(self):
        """
        Calculate all pairwise within-frame cell distances and cache them.

        Returns:
            pandas.DataFrame with one row per ordered pair of distinct cells
            sharing a frame (joined on groupby), including a 'distance'
            column in pixels.
        """
        if self._distance is not None:
            return self._distance
        one = self.cells.copy().reset_index(drop=True)
        one.index.name = 'db_id'
        one = one.reset_index()
        dist = one.merge(one.rename(columns={
            'x': 'n_x', 'y': 'n_y',
            'phenotype_label': 'n_phenotype_label',
            'cell_index': 'n_cell_index', 'db_id': 'n_db_id'}),
            on=self.groupby)
        dist = dist.loc[dist['db_id'] != dist['n_db_id']]
        xdist = dist['x'].subtract(dist['n_x'])
        ydist = dist['y'].subtract(dist['n_y'])
        dist['distance'] = np.sqrt(xdist.multiply(xdist).add(
            ydist.multiply(ydist)))
        self._distance = dist
        return dist

    def _meta_attrs(self, f):
        """Traverse to the <location>/meta dataset attributes in an open file."""
        d = f
        for level in (self.location + '/meta').split('/'):
            if level == '':
                continue
            d = d[level]
        return d.attrs

    def _get_meta_attr(self, key):
        # FIX: the original property getters never closed the h5 handle;
        # the context manager guarantees it.
        with h5py.File(self.h5path, 'r') as f:
            return self._meta_attrs(f)[key]

    def _set_meta_attr(self, key, value):
        if self.mode not in ['a', 'r+', 'w']:
            raise ValueError('cant write on readonly')
        # FIX: same handle leak as the getters.
        with h5py.File(self.h5path, 'r+') as f:
            self._meta_attrs(f)[key] = value

    @property
    def radius(self):
        # (float) neighborhood radius in pixels, stored in file metadata
        return self._get_meta_attr('radius')

    @radius.setter
    def radius(self, value):
        self._set_meta_attr('radius', value)

    @property
    def microns_per_pixel(self):
        # (float) image scale, stored in file metadata
        return self._get_meta_attr('microns_per_pixel')

    @microns_per_pixel.setter
    def microns_per_pixel(self, value):
        self._set_meta_attr('microns_per_pixel', value)

    @property
    def groupby(self):
        # (list) fields that jointly identify an image frame (json in metadata)
        return json.loads(self._get_meta_attr('groupby'))

    @groupby.setter
    def groupby(self, value):
        self._set_meta_attr('groupby', json.dumps(value))

    @property
    def cells(self):
        """
        Access the pandas.DataFrame stored in the h5 object and return it.

        Returns:
            (pandas.DataFrame) a copy of the cached cells, or None if no
            cells have been stored yet
        """
        if self._cells is not None:
            return self._cells.copy()
        # FIX: after traversing to the location group, the original re-listed
        # the *file root* (names = [x for x in f]) instead of the traversed
        # group, so the 'cells' check was wrong for non-empty locations.
        with h5py.File(self.h5path, 'r') as f:
            d = f
            for level in self.location.split('/'):
                if level == '':
                    continue
                d = d[level]
            names = [x for x in d]
        if 'cells' not in names:
            return None
        self._cells = pd.read_hdf(self.h5path, self.location + '/cells')
        return self._cells.copy()

    def set_cells(self, df, groupby):
        """
        Define the cells as a pandas.DataFrame.

        Inputs:
            :df: (pandas.DataFrame) must have columns
                 ``['cell_index','x','y','phenotype_label']``
            :groupby: (list) fields that group the DataFrame by image frame

        Raises:
            ValueError: on a read-only object or missing required columns
        """
        if self.mode not in ['w', 'r+', 'a']:
            raise ValueError("Cannot set data in read-only")
        self.groupby = groupby
        required = ['cell_index', 'x', 'y', 'phenotype_label']
        for r in required:
            if r not in df.columns:
                # FIX: 'ValuError' typo raised NameError instead of ValueError
                raise ValueError("Cell dataframe input needs at least " +
                                 str(required))
        df = pd.DataFrame(
            df.loc[:, required + groupby].dropna(subset=['phenotype_label']))
        self._cells = df.copy()
        # new cells invalidate any previously computed distance cache
        self._distance = None
        self._cells.to_hdf(self.h5path, self.location + '/cells',
                           mode='r+', format='table', complib='zlib',
                           complevel=9)