import pandas as pd
import numpy as np
import h5py, os, json, sys, shutil
from uuid import uuid4
from pythologist_image_utilities import map_image_ids
from pythologist_reader.qc import QC
from pythologist import CellDataFrame
"""
These are classes to help deal with cell-level image data
"""
class CellFrameGeneric(object):
    """
    A generic CellFrameData object

    Stores the data tables of a single image frame (their schema is held in
    ``self.data_tables``) together with a dictionary of images keyed by
    image id.
    """

    def __init__(self):
        self._processed_image_id = None
        self._images = {}  # Database of images, keyed by image id
        self._id = uuid4().hex
        self.frame_name = None
        # Schema of each table: the index name and the ordered column names
        self.data_tables = {
            'cells': {'index': 'cell_index',
                      'columns': ['x', 'y', 'phenotype_index',
                                  'region_index']},
            'cell_tags': {'index': 'db_id',
                          'columns': ['tag_index', 'cell_index']},
            'cell_measurements': {'index': 'measurement_index',
                                  'columns': ['cell_index', 'statistic_index', 'feature_index', 'channel_index', 'value']},
            'measurement_features': {'index': 'feature_index',
                                     'columns': ['feature_label']},
            'measurement_channels': {'index': 'channel_index',
                                     'columns': ['channel_label', 'channel_abbreviation', 'image_id']},
            'measurement_statistics': {'index': 'statistic_index',
                                       'columns': ['statistic_label']},
            'phenotypes': {'index': 'phenotype_index',
                           'columns': ['phenotype_label']},
            'segmentation_images': {'index': 'db_id',
                                    'columns': ['segmentation_label', 'image_id']},
            'regions': {'index': 'region_index',
                        'columns': ['region_label', 'region_size', 'image_id']},
            'cell_interactions': {'index': 'db_id',
                                  'columns': ['cell_index', 'neighbor_cell_index', 'pixel_count', 'touch_distance']},
            'tags': {'index': 'tag_index',
                     'columns': ['tag_label']}
        }
        # Do not access directly. Use set_data and get_data to access.
        self._data = {}
        for table_name, spec in self.data_tables.items():
            empty = pd.DataFrame(columns=spec['columns'])
            empty.index.name = spec['index']
            self._data[table_name] = empty

    @property
    def id(self):
        """
        Returns the frame UUID4 hex string
        """
        return self._id

    @property
    def shape(self):
        """
        Returns the (tuple) shape of the processed image
        """
        return self.processed_image.shape

    @property
    def processed_image_id(self):
        """
        Returns (str) id of the processed image of this frame
        """
        return self._processed_image_id

    @property
    def processed_image(self):
        """
        Returns (numpy.array) a copy of the processed_image
        """
        return self._images[self._processed_image_id].copy()

    def set_processed_image_id(self, image_id):
        """
        Args:
            image_id (str): id of the image to use as the processed image
        """
        self._processed_image_id = image_id

    @property
    def table_names(self):
        """
        Return a list of data table names
        """
        return list(self.data_tables.keys())

    def set_data(self, table_name, table):
        """
        Set the data table

        Args:
            table_name (str): the table name
            table (pd.DataFrame): the input table

        Raises:
            ValueError: if the table name is unknown, or the columns or the
                index name do not match the defined format
        """
        # Assign data to the standard tables. Do some column name checking to make sure we are getting what we expect
        if table_name not in self.data_tables:
            raise ValueError("Error table name doesn't exist in defined formats")
        expected = self.data_tables[table_name]
        if set(list(table.columns)) != set(expected['columns']):
            raise ValueError("Error column names don't match defined format\n" +
                             str(list(table.columns)) + "\n" +
                             str(expected['columns']))
        if table.index.name != expected['index']:
            raise ValueError("Error index name doesn't match defined format")
        # Auto-sort the columns, and assign a copy so we aren't ever assigning by reference
        self._data[table_name] = table.loc[:, expected['columns']].copy()

    def set_regions(self, regions, use_processed_region=True, unset_label='undefined', verbose=False):
        """
        Alter the regions in the frame

        Args:
            regions (dict): a dictionary of mutually exclusive region labels and binary masks
                if a region does not cover all the workable areas then it will be the only label
                and the unused area will get the 'unset_label' as a different region
            use_processed_region (bool): default True keep the processed region subtracted
            unset_label (str): name of unset regions default (undefined)
            verbose (bool): default False, write progress messages to stderr
        """
        regions = regions.copy()
        # Delete the images of our current regions. The region images are the
        # ones listed in the 'regions' table; the processed image is never dropped.
        for image_id in self.get_data('regions')['image_id']:
            if image_id != self.processed_image_id:
                self._images.pop(image_id, None)

        processed = self.processed_image
        labels = list(regions.keys())
        ids = [uuid4().hex for _ in labels]
        # NOTE(review): sizes are taken from the masks as passed in, before
        # they are intersected with the processed region -- confirm intended.
        sizes = [regions[label].sum() for label in labels]
        # The remainder must be an integer/boolean mask: a float array
        # (plain np.ones) cannot be combined with the bitwise operators below.
        remainder = processed if use_processed_region else np.ones_like(processed)
        for image_id, label in zip(ids, labels):
            my_image = regions[label]
            if use_processed_region:
                my_image = my_image & processed
            self._images[image_id] = my_image
            remainder = remainder & (~my_image)

        remaining = remainder.sum().sum()
        if verbose:
            sys.stderr.write("Remaining areas after setting are " + str(remaining) + "\n")
        if remaining > 0:
            # Whatever is left uncovered becomes its own region
            labels += [unset_label]
            sizes += [remaining]
            ids += [uuid4().hex]
            self._images[ids[-1]] = remainder
            regions[unset_label] = remainder

        regions2 = pd.DataFrame({'region_label': labels,
                                 'region_size': sizes,
                                 'image_id': ids})
        regions2.index.name = 'region_index'
        self.set_data('regions', regions2)

        def get_label(x, y, regions_dict):
            # First region whose mask is set at the coordinate, NaN if none is
            for label in regions_dict:
                if regions_dict[label][y][x] == 1:
                    return label
            return np.nan

        recode = self.get_data('cells').copy()
        recode['new_region_label'] = pd.Series(
            [get_label(x, y, regions) for x, y in zip(recode['x'], recode['y'])],
            index=recode.index,
            dtype=object)
        # see how many we need to drop because the centroid falls in an unprocessed region
        unlabeled = recode['new_region_label'].isna()
        if verbose:
            sys.stderr.write(str(recode.loc[unlabeled].shape[0]) + " cells with centroids beyond the processed region are being dropped\n")
        recode = recode.loc[~unlabeled].copy()
        recode = recode.drop(columns='region_index').reset_index().\
            merge(regions2[['region_label']].reset_index(),
                  left_on='new_region_label', right_on='region_label').\
            drop(columns=['region_label', 'new_region_label']).set_index('cell_index')
        self.set_data('cells', recode)
        return

    def get_data(self, table_name):
        """
        Get the data table

        Args:
            table_name (str): the name of the table to access

        Returns:
            pandas.DataFrame: a copy of the stored table
        """
        return self._data[table_name].copy()

    def read_hdf(self, h5file, location=''):
        """
        Load the tables, images and metadata of the frame from an HDF5 file

        Args:
            h5file (str): path of the HDF5 file
            location (str): '/' separated group path of the frame inside the file
        """
        path_parts = location.split('/') if location != '' else []
        # The context manager guarantees the handle is released even on error
        with h5py.File(h5file, 'r') as f:
            subgroup = f
            for part in path_parts:
                subgroup = subgroup[part]
            for table_name in list(subgroup['data']):
                loc = '/'.join(path_parts + ['data', table_name])
                self.set_data(table_name, pd.read_hdf(h5file, loc))
            # now get images
            for image_name in list(subgroup['images']):
                self._images[image_name] = np.array(subgroup['images'][image_name])
            meta = subgroup['meta'].attrs
            self.frame_name = meta['frame_name']
            self._id = meta['id']
            self.set_processed_image_id(meta['processed_image_id'])
        return

    def to_hdf(self, h5file, location='', mode='w'):
        """
        Write the tables, images and metadata of the frame to an HDF5 file

        Args:
            h5file (str): path of the HDF5 file
            location (str): group path of the frame inside the file
            mode (str): mode used for the first open of the file
        """
        with h5py.File(h5file, mode) as f:
            f.create_group(location + '/data')
            f.create_group(location + '/images')
        for table_name in self.data_tables.keys():
            self.get_data(table_name).to_hdf(h5file,
                                             location + '/data/' + table_name,
                                             mode='a',
                                             format='table',
                                             complib='zlib',
                                             complevel=9)
        with h5py.File(h5file, 'a') as f:
            for image_id, image in self._images.items():
                f.create_dataset(location + '/images/' + image_id, data=image,
                                 compression='gzip', compression_opts=9)
            # The metadata lives in the attributes of a placeholder dataset
            dset = f.create_dataset(location + '/meta', (100,), dtype=h5py.special_dtype(vlen=str))
            dset.attrs['frame_name'] = self.frame_name
            dset.attrs['processed_image_id'] = self.processed_image_id
            dset.attrs['id'] = self._id

    def _segmentation_image(self, segmentation_label):
        # Image registered under the segmentation label, or None when absent
        segmentation = self.get_data('segmentation_images')
        if segmentation_label not in list(segmentation['segmentation_label']):
            return None
        image_id = segmentation.set_index('segmentation_label').loc[segmentation_label, 'image_id']
        return self.get_image(image_id)

    def cell_map(self):
        """
        Return a dataframe of cell ID's and locations, or None if there is no cell_map image
        """
        image = self._segmentation_image('cell_map')
        if image is None:
            return None
        return map_image_ids(image).rename(columns={'id': 'cell_index'})

    def cell_map_image(self):
        """
        Return the image of cells by ID's, or None if there is no cell_map image
        """
        return self._segmentation_image('cell_map')

    def edge_map(self):
        """
        Return a dataframe of cells by ID's of coordinates only on the edge of the cells,
        or None if there is no edge_map image
        """
        image = self._segmentation_image('edge_map')
        if image is None:
            return None
        return map_image_ids(image).rename(columns={'id': 'cell_index'})

    def edge_map_image(self):
        """
        Return an image of edges of integers by ID, or None if there is no edge_map image
        """
        return self._segmentation_image('edge_map')

    def segmentation_info(self):
        """
        Return a dataframe with info about segmentation like cell areas and circumferences
        """
        edge = self.edge_map()
        # handle the case where there is no edge data
        if edge is None:
            return pd.DataFrame(index=self.get_data('cells').index, columns=['edge_pixels', 'area_pixels'])
        edge_pixels = edge.reset_index().groupby(['cell_index']).count()[['x']].\
            rename(columns={'x': 'edge_pixels'})
        area_pixels = self.cell_map().reset_index().groupby(['cell_index']).count()[['x']].\
            rename(columns={'x': 'area_pixels'})
        return edge_pixels.merge(area_pixels,
                                 left_index=True,
                                 right_index=True).reset_index().set_index('cell_index')

    def interaction_map(self):
        """
        Returns:
            pandas.DataFrame: return a dataframe of which cells are in contact with one another
        """
        return self.get_data('cell_interactions')

    def set_interaction_map(self, touch_distance=1):
        """
        Measure the cell-cell contact interactions

        Args:
            touch_distance (int): optional default is 1 distance to look away from a cell for another cell
        """
        full = self.cell_map()
        edge = self.edge_map()
        if full is None or edge is None:
            return None
        d1 = edge.reset_index()
        d1['key'] = 1
        d2 = pd.DataFrame({'mod': [-1 * touch_distance, 0, touch_distance]})
        d2['key'] = 1
        # Cross join each edge pixel with every (x,y) offset
        d3 = d1.merge(d2, on='key').merge(d2, on='key')
        d3['x'] = d3['x'].add(d3['mod_x'])
        d3['y'] = d3['y'].add(d3['mod_y'])
        d3 = d3[['x', 'y', 'cell_index', 'key']].rename(columns={'cell_index': 'neighbor_cell_index'})
        # Shifted edge pixels landing on a different cell count as contact
        im = full.reset_index().merge(d3, on=['x', 'y']).\
            query('cell_index!=neighbor_cell_index').\
            drop_duplicates().groupby(['cell_index', 'neighbor_cell_index']).count()[['key']].reset_index().\
            rename(columns={'key': 'pixel_count'})
        im['touch_distance'] = touch_distance
        im.index.name = 'db_id'
        self.set_data('cell_interactions', im)

    @property
    def thresholds(self):
        raise ValueError('Override this to use it.')

    def get_channels(self, all=False):
        """
        Return a dataframe of the Channels

        Args:
            all (bool): default False if all is set to true will also include excluded channels (like autofluoresence)
        Returns:
            pandas.DataFrame: channel information
        """
        channels = self.get_data('measurement_channels')
        if all:
            return channels
        return channels.loc[~channels['channel_label'].isin(self.excluded_channels)]

    def get_regions(self):
        """
        Return a copy of the regions table
        """
        return self.get_data('regions')

    def get_raw(self, feature_label, statistic_label, all=False, channel_abbreviation=True):
        """
        Get the raw data

        Args:
            feature_label (str): name of the feature
            statistic_label (str): name of the statistic to extract
            all (bool): default False if True put out everything including excluded channels
            channel_abbreviation (bool): default True means use the abbreviations if available
        Returns:
            pandas.DataFrame: the dataframe
        """
        stats = self.get_data('measurement_statistics').reset_index()
        stats = stats.loc[stats['statistic_label'] == statistic_label, 'statistic_index'].iloc[0]
        feat = self.get_data('measurement_features').reset_index()
        feat = feat.loc[feat['feature_label'] == feature_label, 'feature_index'].iloc[0]
        measure = self.get_data('cell_measurements')
        measure = measure.loc[(measure['statistic_index'] == stats) & (measure['feature_index'] == feat)]
        channels = self.get_data('measurement_channels')
        if not all:
            channels = channels.loc[~channels['channel_label'].isin(self.excluded_channels)]
        measure = measure.merge(channels, left_on='channel_index', right_index=True)
        measure = measure.reset_index().pivot(index='cell_index', columns='channel_label', values='value')
        if not channel_abbreviation:
            return measure
        all_channels = self.get_data('measurement_channels')
        abbreviations = dict(zip(all_channels['channel_label'],
                                 all_channels['channel_abbreviation']))
        return measure.rename(columns=abbreviations)

    def default_raw(self):
        # override this
        return None

    def copy(self):
        """
        Return a new object of the same type carrying copies of the data tables
        """
        # NOTE(review): only the data tables are carried over; images, the
        # processed image id and frame_name are not, and the copy gets a
        # fresh id -- confirm this is intended.
        them = type(self)()
        for table_name in self.data_tables.keys():
            them._data[table_name] = self._data[table_name].copy()
        return them

    @property
    def excluded_channels(self):
        raise ValueError("Must be overridden")

    def binary_calls(self):
        """
        Return all the binary feature calls (alias)
        """
        return self.phenotype_calls()

    def phenotype_calls(self):
        """
        Return all the binary feature calls
        """
        phenotypes = self.get_data('phenotypes')['phenotype_label'].dropna().tolist()
        # Start from all zeros, then flag the phenotype each cell is labeled with
        temp = pd.DataFrame(0, index=self.get_data('cells').index, columns=phenotypes)
        temp = temp.merge(self.cell_df()[['phenotype_label']], left_index=True, right_index=True)
        for phenotype in phenotypes:
            temp.loc[temp['phenotype_label'] == phenotype, phenotype] = 1
        return temp.drop(columns='phenotype_label').astype(np.int8)

    def scored_calls(self):
        # Must be overridden
        return None

    @staticmethod
    def _rows_as_dicts(table, column_name):
        # Collapse every row into a {column: value} dict stored in one column
        return table.apply(lambda x: dict(zip(list(x.index), list(x))), 1).\
            reset_index().rename(columns={0: column_name}).set_index('cell_index')

    @property
    def cdf(self):
        """
        Return the pythologist.CellDataFrame of the frame
        """
        cell_index = self.get_data('cells').index
        # get our region sizes
        region_sizes = self.get_data('regions').set_index('region_label')['region_size'].astype(int).to_dict()
        # get our cells
        temp1 = self.get_data('cells').drop(columns='phenotype_index').\
            merge(self.get_data('regions'),
                  left_on='region_index',
                  right_index=True).drop(columns=['image_id', 'region_index', 'region_size'])
        temp1['regions'] = temp1.apply(lambda x: region_sizes, 1)

        temp2 = self.scored_calls()
        if temp2 is not None:
            temp2 = self._rows_as_dicts(temp2, 'scored_calls')
            temp1 = temp1.merge(temp2, left_index=True, right_index=True)
        else:
            temp1['scored_calls'] = temp1.apply(lambda x: {}, 1)

        temp3 = self._rows_as_dicts(self.phenotype_calls(), 'phenotype_calls')
        temp1 = temp1.merge(temp3, left_index=True, right_index=True)

        temp4 = None
        # extract default values only if we have whole cell
        if "Whole Cell" in self.get_data('measurement_features')['feature_label'].tolist():
            temp4 = self.default_raw()
        if temp4 is not None:
            temp4 = self._rows_as_dicts(temp4, 'channel_values')
            temp1 = temp1.merge(temp4, left_index=True, right_index=True)
        else:
            temp1['channel_values'] = np.nan

        # Get neighbor data .. may not be available for all cells
        # Set a default of a null frame and only try and set if there are some neighbors present
        neighbors = pd.DataFrame(index=cell_index, columns=['neighbors'])
        interactions = self.interaction_map()
        if interactions.shape[0] > 0:
            neighbors = interactions.groupby('cell_index').\
                apply(lambda x:
                      dict(zip(
                          x['neighbor_cell_index'].astype(int), x['pixel_count'].astype(int)
                      ))
                      ).reset_index().rename(columns={0: 'neighbors'}).set_index('cell_index')

        # only do edges if we have them by setting a null value for default
        edge_length = pd.DataFrame(index=cell_index, columns=['edge_length'])
        edge = self.edge_map()
        if edge is not None:
            edge_length = edge.reset_index().groupby('cell_index').count()[['x']].\
                rename(columns={'x': 'edge_length'})
            edge_length['edge_length'] = edge_length['edge_length'].astype(int)

        cell_area = pd.DataFrame(index=cell_index, columns=['cell_area'])
        full = self.cell_map()
        if full is not None:
            cell_area = full.reset_index().groupby('cell_index').count()[['x']].\
                rename(columns={'x': 'cell_area'})
            cell_area['cell_area'] = cell_area['cell_area'].astype(int)

        temp5 = cell_area.merge(edge_length, left_index=True, right_index=True).\
            merge(neighbors, left_index=True, right_index=True, how='left')
        # these are ones we actually have measured
        temp5.loc[temp5['neighbors'].isna(), 'neighbors'] = temp5.loc[temp5['neighbors'].isna(), 'neighbors'].apply(lambda x: {})
        temp1 = temp1.merge(temp5, left_index=True, right_index=True, how='left')
        # These we were not able to measure
        temp1.loc[temp1['neighbors'].isna(), 'neighbors'] = np.nan

        temp1['frame_name'] = self.frame_name
        temp1['frame_id'] = self.id
        temp1 = temp1.reset_index()
        temp1 = temp1.sort_values('cell_index').reset_index(drop=True)
        # Sample and project level fields are filled in by the callers that know them
        temp1['sample_name'] = 'undefined'
        temp1['project_name'] = 'undefined'
        temp1['sample_id'] = 'undefined'
        temp1['project_id'] = 'undefined'

        def _get_phenotype(d):
            vals = [k for k, v in d.items() if v == 1]
            return np.nan if len(vals) == 0 else vals[0]

        temp1['phenotype_label'] = temp1.apply(lambda x:
                                               _get_phenotype(x['phenotype_calls']),
                                               1)
        # Let's tack on the image shape
        temp1['frame_shape'] = temp1.apply(lambda x: self.shape, 1)
        return CellDataFrame(temp1)

    def binary_df(self):
        """
        Return the phenotype calls in long format with '+' / '-' scores
        """
        temp1 = self.phenotype_calls().stack().reset_index().\
            rename(columns={'level_1': 'binary_phenotype', 0: 'score'})
        temp1.loc[temp1['score'] == 1, 'score'] = '+'
        temp1.loc[temp1['score'] == 0, 'score'] = '-'
        temp1['gated'] = 0
        temp1.index.name = 'db_id'
        return temp1

    def cell_df(self):
        """
        Return the cells joined with their region, phenotype and segmentation info
        """
        celldf = self.get_data('cells').\
            merge(self.get_data('regions').rename(columns={'image_id': 'region_image_id'}),
                  left_on='region_index',
                  right_index=True).\
            merge(self.get_data('phenotypes'), left_on='phenotype_index', right_index=True).\
            merge(self.segmentation_info(), left_index=True, right_index=True, how='left')
        return celldf.drop(columns=['phenotype_index', 'region_index'])

    def complete_df(self):
        # a dataframe for every cell that has everything (not implemented)
        return

    def get_image(self, image_id):
        """
        Args:
            image_id (str): get the image by this id
        Returns:
            numpy.array: a copy of the stored image
        """
        return self._images[image_id].copy()
class CellSampleGeneric(object):
    """
    A generic sample: a collection of cell frames keyed by frame id plus a
    key table describing the frames.
    """

    def __init__(self):
        self._frames = {}  # frame_id -> cell frame object
        self._key = None  # table of frame info; read back from the 'info' node of the HDF5 file
        self._id = uuid4().hex
        self.sample_name = np.nan
        return

    @property
    def id(self):
        """
        Return the UUID4 str
        """
        return self._id

    def create_cell_frame_class(self):
        # Factory for the frame objects built by read_hdf; override to use another frame class
        return CellFrameGeneric()

    @property
    def frame_ids(self):
        """
        Return the sorted list of frame IDs
        """
        return sorted(list(self._frames.keys()))

    @property
    def key(self):
        """
        Return a pandas.DataFrame of info about the sample
        """
        return self._key

    def get_frame(self, frame_id):
        """
        Args:
            frame_id (str): the ID of the frame you want to access
        Returns:
            CellFrameGeneric: the cell frame
        """
        return self._frames[frame_id]

    @property
    def cdf(self):
        """
        Return the pythologist.CellDataFrame of the sample
        """
        output = []
        for frame_id in self.frame_ids:
            temp = self.get_frame(frame_id).cdf
            temp['sample_name'] = self.sample_name
            temp['sample_id'] = self.id
            output.append(temp)
        output = pd.concat(output).reset_index(drop=True)
        output.index.name = 'db_id'
        output['project_name'] = 'undefined'
        output['project_id'] = 'undefined'
        return CellDataFrame(pd.DataFrame(output))

    def to_hdf(self, h5file, location='', mode='w'):
        """
        Write the sample metadata, its frames and its key table to an HDF5 file

        Args:
            h5file (str): path of the HDF5 file
            location (str): group path of the sample inside the file
            mode (str): mode used for the first open of the file
        """
        # The context manager releases the handle before the frames reopen the file
        with h5py.File(h5file, mode) as f:
            # Replace any previous metadata and frames stored at this location
            if location + '/meta' in f:
                del f[location + '/meta']
            dset = f.create_dataset(location + '/meta', (100,), dtype=h5py.special_dtype(vlen=str))
            dset.attrs['sample_name'] = self.sample_name
            dset.attrs['id'] = self._id
            if location + '/frames' in f:
                del f[location + '/frames']
            f.create_group(location + '/frames')
        for frame_id in self.frame_ids:
            self._frames[frame_id].to_hdf(h5file,
                                          location + '/frames/' + frame_id,
                                          mode='a')
        self._key.to_hdf(h5file, location + '/info', mode='r+', format='table', complib='zlib', complevel=9)

    def read_hdf(self, h5file, location=''):
        """
        Load the sample metadata, its frames and its key table from an HDF5 file

        Args:
            h5file (str): path of the HDF5 file
            location (str): '/' separated group path of the sample inside the file
        """
        path_parts = location.split('/') if location != '' else []
        with h5py.File(h5file, 'r') as f:
            subgroup = f
            for part in path_parts:
                subgroup = subgroup[part]
            self._id = subgroup['meta'].attrs['id']
            self.sample_name = subgroup['meta'].attrs['sample_name']
            for frame_id in list(subgroup['frames']):
                cellframe = self.create_cell_frame_class()
                cellframe.read_hdf(h5file, location='/'.join(path_parts + ['frames', frame_id]))
                self._frames[frame_id] = cellframe
            self._key = pd.read_hdf(h5file, '/'.join(path_parts + ['info']))
        return

    def _with_key_line(self, frame_id, table):
        # Prefix every row of the table with the key row of the frame, using
        # a constant 'key' column to cross join (the column is added to table)
        key_line = self.key.set_index('frame_id').loc[[frame_id]].reset_index()
        key_line['key'] = 1
        table['key'] = 1
        return key_line.merge(table, on='key').drop(columns='key')

    def cell_df(self):
        """
        Return the cell tables of every frame stacked together with the frame key columns
        """
        frames = [self._with_key_line(frame_id, self.get_frame(frame_id).cell_df().reset_index())
                  for frame_id in self.frame_ids]
        frames = pd.concat(frames).reset_index(drop=True)
        frames.index.name = 'sample_cell_index'
        return frames

    def binary_df(self):
        """
        Return the binary phenotype tables of every frame joined to the sample cell index
        """
        fc = self.cell_df()[['frame_id', 'cell_index']].reset_index()
        frames = [self._with_key_line(frame_id, self.get_frame(frame_id).binary_df())
                  for frame_id in self.frame_ids]
        return fc.merge(pd.concat(frames).reset_index(drop=True), on=['frame_id', 'cell_index'])

    def interaction_map(self):
        """
        Return the interaction tables of every frame with sample level cell indices
        for both the cell and its neighbor
        """
        fc = self.cell_df()[['frame_id', 'cell_index']].reset_index()
        frames = [self._with_key_line(frame_id, self.get_frame(frame_id).interaction_map())
                  for frame_id in self.frame_ids]
        frames = pd.concat(frames).reset_index(drop=True)
        neighbor_fc = fc.rename(columns={'sample_cell_index': 'neighbor_sample_cell_index',
                                         'cell_index': 'neighbor_cell_index'})
        return frames.merge(fc, on=['frame_id', 'cell_index']).\
            merge(neighbor_fc, on=['frame_id', 'neighbor_cell_index'])

    def frame_iter(self):
        """
        An iterator of frames

        Returns:
            CellFrameGeneric
        """
        for frame_id in self.frame_ids:
            yield self.get_frame(frame_id)
class CellProjectGeneric(object):
    """
    A project stored in an HDF5 file: a collection of samples plus project
    level metadata (project_name, microns_per_pixel, id).
    """

    def __init__(self, h5path, mode='r'):
        """
        Create a CellProjectGeneric object or read from/add to an existing one

        Args:
            h5path (str): path to read/from or store/to
            mode (str): 'r' read, 'a' append, 'w' create/write, 'r+' read/write
                (passed to h5py, which requires the file to exist for 'r+')

        Raises:
            ValueError: if mode is 'r' and the file does not exist
        """
        self._key = None
        self.h5path = h5path
        self.mode = mode
        self._sample_cache_name = None
        self._sample_cache = None
        if mode == 'r':
            if not os.path.exists(h5path):
                raise ValueError("Cannot read a file that does not exist")
        if mode == 'w' or mode == 'r+':
            with h5py.File(self.h5path, mode) as f:
                if '/samples' not in f:
                    f.create_group('/samples')
                if '/meta' not in f:
                    dset = f.create_dataset('/meta', (100,), dtype=h5py.special_dtype(vlen=str))
                else:
                    dset = f['/meta']
                # NOTE(review): the metadata is reset on every 'w'/'r+' open,
                # including for an existing file -- confirm this is intended.
                dset.attrs['project_name'] = np.nan
                dset.attrs['microns_per_pixel'] = np.nan
                dset.attrs['id'] = uuid4().hex
        return

    def copy(self, path, overwrite=False, output_mode='r'):
        """
        Copy the project file and return a project object opened on the copy

        Args:
            path (str): destination path
            overwrite (bool): default False, allow replacing an existing file
            output_mode (str): mode used to open the copy

        Raises:
            ValueError: if the path exists and overwrite is False
        """
        if os.path.exists(path) and overwrite is False:
            raise ValueError("Cannot overwrite unless overwrite is set to True")
        shutil.copy(self.h5path, path)
        return self.__class__(path, mode=output_mode)

    @classmethod
    def concat(cls, path, array_like, overwrite=False, verbose=False):
        """
        Concatenate projects into a new project file

        Args:
            path (str): destination path
            array_like (iterable): the projects to combine, any iterable
            overwrite (bool): default False, allow replacing an existing file
            verbose (bool): default False, write progress messages to stderr

        Returns:
            the project opened on the new file (a copy of the first project
            with the samples of the others appended)

        Raises:
            ValueError: if the path exists and overwrite is False, or if no
                projects are given
        """
        if os.path.exists(path) and overwrite is False:
            raise ValueError("Cannot overwrite unless overwrite is set to True")
        # Materialize once so generators and other one-shot iterables work
        arr = list(array_like)
        if len(arr) == 0:
            raise ValueError("cannot concat empty list")
        # copy the first
        if verbose:
            sys.stderr.write("Copy the first element\n")
        cpi = arr[0].copy(path, output_mode='r+', overwrite=overwrite)
        for project in arr[1:]:
            if verbose:
                sys.stderr.write("Add project " + str(project.id) + " " + str(project.project_name) + "\n")
            for s in project.sample_iter():
                if verbose:
                    sys.stderr.write("   Add sample " + str(s.id) + " " + str(s.sample_name) + "\n")
                cpi.append_sample(s)
        return cpi

    def append_sample(self, sample):
        """
        Append sample to the project

        Args:
            sample (CellSampleGeneric): sample object

        Raises:
            ValueError: if the project is opened read-only
        """
        if self.mode == 'r':
            raise ValueError("Error: cannot write to a path in read-only mode.")
        sample.to_hdf(self.h5path, location='samples/' + sample.id, mode='a')
        current = self.key
        if current is None:
            current = pd.DataFrame([{'sample_id': sample.id,
                                     'sample_name': sample.sample_name}])
            current.index.name = 'db_id'
        else:
            iteration = max(current.index) + 1
            addition = pd.DataFrame([{'db_id': iteration,
                                      'sample_id': sample.id,
                                      'sample_name': sample.sample_name}]).set_index('db_id')
            current = pd.concat([current, addition])
        current.to_hdf(self.h5path, 'info', mode='r+', complib='zlib', complevel=9, format='table')
        return

    def qc(self, *args, **kwargs):
        """
        Returns:
            QC: QC class to do quality checks
        """
        return QC(self, *args, **kwargs)

    def _read_meta(self, name):
        # Read one attribute of the '/meta' dataset, always closing the file
        with h5py.File(self.h5path, 'r') as f:
            return f['meta'].attrs[name]

    def _write_meta(self, name, value):
        # Write one attribute of the '/meta' dataset; refuses in read-only mode
        if self.mode == 'r':
            raise ValueError('cannot write if read only')
        with h5py.File(self.h5path, 'r+') as f:
            f['meta'].attrs[name] = value

    @property
    def id(self):
        """
        Returns the (str) UUID4 string
        """
        return self._read_meta('id')

    @property
    def project_name(self):
        """
        Return or set the (str) project_name
        """
        return self._read_meta('project_name')

    @project_name.setter
    def project_name(self, name):
        self._write_meta('project_name', name)

    @property
    def microns_per_pixel(self):
        """
        Return or set the (float) microns_per_pixel
        """
        return self._read_meta('microns_per_pixel')

    @microns_per_pixel.setter
    def microns_per_pixel(self, value):
        self._write_meta('microns_per_pixel', value)

    def set_id(self, name):
        """
        Set the project ID

        Args:
            name (str): project_id

        Raises:
            ValueError: if the project is opened read-only
        """
        self._write_meta('id', name)

    @property
    def cdf(self):
        """
        Return the pythologist.CellDataFrame of the project
        """
        project_name = self.project_name
        project_id = self.id
        output = []
        for sample_id in self.sample_ids:
            temp = self.get_sample(sample_id).cdf
            temp['project_name'] = project_name
            temp['project_id'] = project_id
            output.append(temp)
        output = pd.concat(output).reset_index(drop=True)
        output.index.name = 'db_id'
        cdf = CellDataFrame(pd.DataFrame(output))
        # The stored default is NaN, which is truthy: test for it explicitly
        # so an unset value is not propagated.
        mpp = self.microns_per_pixel
        if mpp is not None and not pd.isna(mpp) and mpp:
            cdf.microns_per_pixel = mpp
        return cdf

    def _with_key_line(self, sample_id, table):
        # Prefix every row of the table with the key row of the sample, using
        # a constant 'key' column to cross join (the column is added to table)
        key_line = self.key.set_index('sample_id').loc[[sample_id]].reset_index()
        key_line['key'] = 1
        table['key'] = 1
        return key_line.merge(table, on='key').drop(columns='key')

    def cell_df(self):
        """
        Return the cell tables of every sample stacked together with the sample key columns
        """
        samples = [self._with_key_line(sample_id, self.get_sample(sample_id).cell_df().reset_index())
                   for sample_id in self.sample_ids]
        samples = pd.concat(samples).reset_index(drop=True)
        samples.index.name = 'project_cell_index'
        return samples

    def binary_df(self):
        """
        Return the binary phenotype tables of every sample joined to the project cell index
        """
        fc = self.cell_df()[['sample_id', 'frame_id', 'cell_index']].reset_index()
        samples = [self._with_key_line(sample_id, self.get_sample(sample_id).binary_df())
                   for sample_id in self.sample_ids]
        return fc.merge(pd.concat(samples).reset_index(drop=True), on=['sample_id', 'frame_id', 'cell_index'])

    def interaction_map(self):
        """
        Return the interaction tables of every sample with project level cell indices
        for both the cell and its neighbor
        """
        fc = self.cell_df()[['sample_id', 'frame_id', 'cell_index']].reset_index()
        samples = [self._with_key_line(sample_id, self.get_sample(sample_id).interaction_map())
                   for sample_id in self.sample_ids]
        samples = pd.concat(samples).reset_index(drop=True)
        neighbor_fc = fc.rename(columns={'project_cell_index': 'neighbor_project_cell_index',
                                         'cell_index': 'neighbor_cell_index'})
        return samples.merge(fc, on=['sample_id', 'frame_id', 'cell_index']).\
            merge(neighbor_fc, on=['sample_id', 'frame_id', 'neighbor_cell_index'])

    def create_cell_sample_class(self):
        # Factory for the sample objects built by get_sample; override to use another sample class
        return CellSampleGeneric()

    @property
    def sample_ids(self):
        """
        Return the sorted list of sample_ids (empty when no sample has been stored)
        """
        key = self.key
        if key is None:
            return []
        return sorted(list(key['sample_id']))

    def get_sample(self, sample_id):
        """
        Get the sample by its id

        Args:
            sample_id (str): the sample id to read
        """
        # Only the most recently read sample is kept in memory
        if self._sample_cache_name == sample_id:
            return self._sample_cache
        sample = self.create_cell_sample_class()
        sample.read_hdf(self.h5path, 'samples/' + sample_id)
        self._sample_cache_name = sample_id
        self._sample_cache = sample
        return sample

    @property
    def key(self):
        """
        Get info about the project (None when the file has no 'info' table)
        """
        with h5py.File(self.h5path, 'r') as f:
            has_info = 'info' in f
        return pd.read_hdf(self.h5path, 'info') if has_info else None

    def sample_iter(self):
        """
        An iterator of CellSampleGeneric
        """
        for sample_id in self.sample_ids:
            yield self.get_sample(sample_id)

    def frame_iter(self):
        """
        An iterator of CellFrameGeneric
        """
        for s in self.sample_iter():
            for frame_id in s.frame_ids:
                yield s.get_frame(frame_id)

    @property
    def channel_image_dataframe(self):
        """
        dataframe with info about channels and images
        """
        pname = self.project_name
        pid = self.id
        measurements = []
        for s in self.sample_iter():
            for f in s.frame_iter():
                mc = f.get_data('measurement_channels')
                mc['project_name'] = pname
                mc['project_id'] = pid
                mc['sample_name'] = s.sample_name
                mc['sample_id'] = s.id
                mc['frame_name'] = f.frame_name
                mc['frame_id'] = f.id
                mc['processed_image_id'] = f.processed_image_id
                measurements.append(mc)
        return pd.concat(measurements).reset_index(drop=True)

    def get_image(self, sample_id, frame_id, image_id):
        """
        Get an image by sample frame and image id

        Args:
            sample_id (str): unique sample id
            frame_id (str): unique frame id
            image_id (str): unique image id
        Returns:
            numpy.array: 2d image array
        """
        return self.get_sample(sample_id).get_frame(frame_id).get_image(image_id)