import h5py, json
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
[docs]class GoodNeighbors(object):
"""
Class for executing and storing the neighborhood analysis
Properties:
:microns_per_pixel: (int) set in initialization
:radius: (float) pixel size of radius, set by set_neighborhood_fractions via the radius_um (in microns)
:groupby: (list) of cells fields who as a group can identify image frames
:h5path: (str) location of the h5path storing things
:location: (str) subdirectory in the h5object
"""
def __init__(self,h5path,location='',mode='r',microns_per_pixel=np.nan):
self.h5path = h5path
self.mode = mode
self.location = location
self._distance = None
if mode in ['w','r+','a']:
f = h5py.File(self.h5path,mode)
if location != '': f.create_group(location+'/cells')
dset = f.create_dataset('/meta', (100,), dtype=h5py.special_dtype(vlen=str))
dset.attrs['microns_per_pixel'] = microns_per_pixel
dset.attrs['groupby'] = json.dumps([])
dset.attrs['radius'] = np.nan
#dset.attrs['id'] = uuid4().hex
f.close()
def set_TSNE(self,sample=5000,*args,**kwargs):
if not self.mode in ['w','r+','a']: raise ValueError('cant write for readonly')
dsdata = self.neighborhood_fraction_matrix().sample(n=sample)
tsne = TSNE(n_components=2,perplexity=6,n_iter=3000).fit_transform(dsdata)
tsne = pd.DataFrame(tsne,columns=['x','y'])
tsne.index = dsdata.index
tsne = tsne.reset_index().merge(self.kmeans_cluster[['cluster_id','k','cell_index']+self.groupby],
on=self.groupby+['cell_index'])
return tsne
@property
def kmeans_cluster(self):
return pd.read_hdf(self.h5path,self.location+'/cells/clusters')
def set_kmeans_cluster(self,k):
if not self.mode in ['w','r+','a']: raise ValueError('cant write for readonly')
km, clusters = self._cluster_kmeans(k)
clusters['k'] = k
clusters = clusters.merge(self.cells.drop(columns='phenotype_label'),on=self.groupby+['cell_index'])
clusters.to_hdf(self.h5path,self.location+'/cells/clusters',
mode='r+',format='table',complib='zlib',complevel=9)
def _cluster_kmeans(self,k):
fmat = self.neighborhood_fraction_matrix()
km = KMeans(n_clusters=k).fit(fmat)
cids = km.labels_
index_labels = list(set(list(self.neighborhood_fractions.columns))-set(['n_phenotype_label','fraction']))
clusters = fmat.reset_index()[index_labels]
clusters['cluster_id'] = cids
clusters.index.name = 'db_id'
clusters.columns = clusters.columns.droplevel(1)
return km, clusters
def test_kmeans(self,kmin=2,kmax=30):
kdata = []
for k in range(2,12):
km, clusters = self._cluster_kmeans(k)
# how how much of the population does the smallest cluster account for
accounted = clusters.groupby('cluster_id')[['cell_index']].count().\
rename(columns={'cell_index':'count'}).reset_index()
smallest = accounted.apply(lambda x: x['count']/clusters.shape[0],1).min()
kdata.append([k,km.inertia_,smallest])
kdata = pd.DataFrame(kdata,columns=['k','inertia','smallest_cluster_fraction'])
return kdata
def neighborhood_fraction_matrix(self):
index_labels = list(set(list(self.neighborhood_fractions.columns))-set(['n_phenotype_label','fraction']))
fmat = self.neighborhood_fractions.set_index(index_labels).pivot(columns='n_phenotype_label')
return fmat
@property
def neighborhood_fractions(self):
return pd.read_hdf(self.h5path,self.location+'/cells/fractions')
def set_neighborhood_fractions(self,radius_um):
self.radius = radius_um/self.microns_per_pixel
dist = self.measure_distance()
focus = dist.loc[dist['distance']<radius]
totals = focus.groupby(self.groupby+['phenotype_label','cell_index']).count()[['db_id']].\
rename(columns={'db_id':'total'}).reset_index()
indiv = focus.groupby(self.groupby+['phenotype_label','cell_index','n_phenotype_label']).count()[['db_id']].\
rename(columns={'db_id':'count'}).reset_index()
fracs = totals.merge(indiv,on=self.groupby+['phenotype_label','cell_index'])
fracs['fraction'] = fracs.apply(lambda x: x['count']/x['total'],1)
fracs = fracs.set_index(self.groupby+['phenotype_label','cell_index','total'])\
[['n_phenotype_label','fraction']].\
pivot(columns='n_phenotype_label')
fracs.columns = fracs.columns.droplevel(0)
fracs = fracs.fillna(0).stack().reset_index().rename(columns={0:'fraction'})
fracs.to_hdf(self.h5path,self.location+'/cells/fractions',
mode='r+',format='table',complib='zlib',complevel=9)
fracs['radius'] = radius
return
[docs] def plot_counts_per_radius(self,min_radius=0,max_radius=150,step_radius=2):
"""
"""
dist = self.measure_distance()
max_distance = 150
#radius = 71
mpp = 0.496
census = []
for i in range(1,min(max_distance,int(dist['distance'].max())),step_radius):
one = dist.loc[dist['distance']<i].\
groupby(['db_id']).count()[['n_db_id']].\
rename(columns={'n_db_id':'count'}).reset_index()[['count']].\
quantile([0.05,0.25,0.5,0.75,0.95],0)
one.index.name = 'quantile'
one = one.reset_index()
one['distance'] = i
one = one.fillna(0)
census.append(one)
census = pd.concat(census).pivot(columns='quantile',index='distance',values='count')
census.columns = [str(x) for x in census.columns]
census = census.reset_index()
return census
[docs] def measure_distance(self):
"""
Calculate the distance and store it in the cache
"""
if self._distance is not None: return self._distance
one = self.cells.copy().reset_index(drop=True)
one.index.name = 'db_id'
one = one.reset_index()
dist = one.merge(one.rename(columns={'x':'n_x','y':'n_y',
'phenotype_label':'n_phenotype_label',
'cell_index':'n_cell_index','db_id':'n_db_id'}),on=self.groupby)
dist = dist.loc[dist['db_id']!=dist['n_db_id']]
xdist = dist['x'].subtract(dist['n_x'])
ydist = dist['y'].subtract(dist['n_y'])
dist['distance'] = np.sqrt(xdist.multiply(xdist).add(ydist.multiply(ydist)))
self._distance = dist
return dist
@property
def radius(self):
f = h5py.File(self.h5path,'r')
levels = (self.location+'/meta').split('/')
d = f
for level in levels:
if level == '': continue
d = d[level]
return d.attrs['radius']
@radius.setter
def radius(self,value):
if not self.mode in ['a','r+','w']: raise ValueError('cant write on readonly')
f = h5py.File(self.h5path,'r+')
levels =(self.location+'/meta').split('/')
d = f
for level in levels:
if level == '': continue
d = d[level]
d.attrs['radius'] = value
return
@property
def microns_per_pixel(self):
f = h5py.File(self.h5path,'r')
levels = (self.location+'/meta').split('/')
d = f
for level in levels:
if level == '': continue
d = d[level]
return d.attrs['microns_per_pixel']
@microns_per_pixel.setter
def microns_per_pixel(self,value):
if not self.mode in ['a','r+','w']: raise ValueError('cant write on readonly')
f = h5py.File(self.h5path,'r+')
levels =(self.location+'/meta').split('/')
d = f
for level in levels:
if level == '': continue
d = d[level]
d.attrs['microns_per_pixel'] = value
return
@property
def groupby(self):
f = h5py.File(self.h5path,'r')
levels = (self.location+'/meta').split('/')
d = f
for level in levels:
if level == '': continue
d = d[level]
return json.loads(d.attrs['groupby'])
@groupby.setter
def groupby(self,value):
if not self.mode in ['a','r+','w']: raise ValueError('cant write on readonly')
f = h5py.File(self.h5path,'r+')
levels =(self.location+'/meta').split('/')
d = f
for level in levels:
if level == '': continue
d = d[level]
d.attrs['groupby'] = json.dumps(value)
return
@property
def cells(self):
"""
Access the pandas.DataFrame stored in the h5 object and return it
Returns:
(pandas.DataFrame)
"""
if self._cells is not None: return self._cells.copy()
f = h5py.File(self.h5path,'r')
names = [x for x in f]
if self.location != '':
d = f
levels = self.location.split('/')
for i, level in enumerate(levels):
if level == '': continue
d = d[level]
names = [x for x in f]
f.close()
if 'cells' not in names: return None
self._cells = pd.read_hdf(self.h5path,self.location+'/cells')
return self._cells.copy()
[docs] def set_cells(self,df,groupby):
"""
Define the cells as a pandas.DataFrame
Inputs:
:df: (pandas.DataFrame) Must have columns defined ``['cell_index','x','y','phenotype_label']``
:groupby: (list) List of fields that will group the pandas.DataFrame by image frame
"""
if self.mode not in ['w','r+','a']: raise ValueError("Cannot set data in read-only")
self.groupby = groupby
required = ['cell_index','x','y','phenotype_label']
for r in required:
if r not in df.columns: raise ValuError("Cell dataframe input needs at least "+str(required))
df = pd.DataFrame(df.loc[:,required+groupby].dropna(subset=['phenotype_label']))
#return df
self._cells = df.copy()
self._cells.to_hdf(self.h5path,self.location+'/cells',
mode='r+',format='table',complib='zlib',complevel=9)