Source code for pythologist.interface

import pandas as pd
import numpy as np
from pythologist.measurements import Measurement
from pythologist_image_utilities import watershed_image, map_image_ids
import sys, os, io
import imageio
from PIL import Image
from pythologist import SubsetLogic as SL
from scipy.ndimage import gaussian_filter

[docs]class SegmentationImageOutput(pd.DataFrame): """ The Segmentation Image Output class """ _metadata = [] def __init__(self,*args, **kw): super(SegmentationImageOutput, self).__init__(*args, **kw) if 'verbose' in kw: self.verbose = kw['verbose'] @property def _constructor(self): return SegmentationImageOutput
[docs] def write_to_path(self,path,suffix='',format='png',overwrite=False): """ Output the data the dataframe's 'image' column to a directory structured by project->sample and named by frame Args: path (str): Where to write the directory of images suffix (str): for labeling the imaages you write format (str): default 'png' format to write the file overwrite (bool): default False. if true can overwrite files in the path Modifies: Creates path folder if necessary and writes images to path """ if os.path.exists(path) and overwrite is False: raise ValueError("Error: use ovewrite=True to overwrite images") if not os.path.exists(path): os.makedirs(path) for i,r in self.iterrows(): spath = os.path.join(path,r['project_name'],r['sample_name']) if not os.path.exists(spath): os.makedirs(spath) if suffix == '': fname = os.path.join(spath,r['frame_name']+'.'+format) else: fname = os.path.join(spath,r['frame_name']+'_'+suffix+'.'+format) imageio.imwrite(fname, r['image'],format=format)
[docs]class SegmentationImages(Measurement): """ Class suitable for generating image outputs """ def __init__(self,*args,**kwargs): """ Args: verbose (bool): output more details if True """ super(SegmentationImages,self).__init__(*args,**kwargs) self._edge_map_cache = None self._cell_map_cache = None self._coordinates = None @staticmethod def _preprocess_dataframe(cdf,*args,**kwargs): base = pd.DataFrame(cdf).copy().loc[:,cdf.frame_columns].drop_duplicates() data = [] if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("Reading image names/ids and sizes\n") for sample_id in base['sample_id'].unique().tolist(): if 'verbose' in kwargs and kwargs['verbose']: sys.stderr.write("Reading sample "+str(sample_id)+"\n") s = cdf.db.get_sample(sample_id) for frame_id in base.loc[base['sample_id']==sample_id,'frame_id']: f = s.get_frame(frame_id) data.append([sample_id,frame_id,f.shape]) base = base.merge(pd.DataFrame(data,columns=['sample_id','frame_id','shape']),on=['sample_id','frame_id']) return base def get_coordinates(self): if self._coordinates is not None: return self._coordinates df = self.set_index(self.cdf.frame_columns+['shape']).stack().\ reset_index().rename(columns={'level_7':'image_type',0:'image'}).\ set_index(self.cdf.frame_columns+['shape','image_type']) imgs = [] for i,r in df.iterrows(): if self.verbose: sys.stderr.write("Extracting coordinates from "+str(list(i))+"\n") left = pd.DataFrame([i],columns=df.index.names) left['_key'] = 1 img=map_image_ids(r['image']).groupby('id').apply(lambda x: list(zip(*x[['x','y']].apply(tuple).tolist()))) img = img.reset_index().rename(columns={'id':'cell_index',0:'coords'}) img['_key'] = 1 img = left.merge(img,on='_key').drop(columns='_key') imgs.append(img) imgs = pd.concat(imgs) self._coordinates = imgs return imgs def get_segmentation_map_images(self,type='edge',subset_logic=None,color=None,watershed_steps=0,blank=(0,0,0,255)): if self.verbose: sys.stderr.write("getting segmap "+str(type)+"\n") ems = self.get_segmentation_maps(type=type) subset = self.cdf if subset_logic is not None: subset = self.cdf.subset(subset_logic) ems = ems.merge(subset.loc[:,subset.frame_columns+['cell_index']],on=subset.frame_columns+['cell_index']) #edf = ems.set_index(list(self.columns)) imgs = [] for i,r in self.iterrows(): imsize = r['shape'] img = pd.DataFrame(np.zeros(imsize)) if subset.shape[0] == 0: # case where there is nothing to do if self.verbose: sys.stderr.write("Empty image for this phenotype subset\n") imgs.append(list(r)+[np.array(img)]) continue #edfsub = edf.loc[tuple(r)] edfsub = ems.loc[ems['frame_id']==r['frame_id']].copy().set_index(list(self.columns)) #if self.verbose: sys.stderr.write("make image and fill zeros\n") fullx = pd.DataFrame({'x':list(range(0,imsize[1]))}) fullx['_key']=1 fully = pd.DataFrame({'y':list(range(0,imsize[0]))}) fully['_key']=1 full = fullx.merge(fully,on='_key').merge(edfsub,on=['x','y'],how='left').fillna(0) img = np.array(full.pivot(columns='x',index='y',values='cell_index').astype(int)) if map_image_ids(img).shape[0] == 0: # There is nothing for us to draw with this phenotype and image imgs.append(list(r)+[np.zeros(imsize)]) continue #if self.verbose: sys.stderr.write("finished making image and fill zeros\n") if watershed_steps > 0: # get the zero and nonzero components mid = map_image_ids(img,remove_zero=False) midzero = list(zip(*mid.query('id==0').copy()[['x','y']].apply(tuple).tolist())) mid = list(zip(*mid.query('id!=0').copy()[['x','y']].apply(tuple).tolist())) img = watershed_image(np.array(img),mid,midzero,steps=watershed_steps) if color is not None: # we need to make a new image thats colored in fresh = np.zeros(list(imsize)+[len(color)]).astype(int) blank = tuple(list(blank)[0:len(color)]) fresh[:][:] = blank # get our coordinates coords = np.array(list(zip(*map_image_ids(img)[['y','x']].apply(tuple).tolist()))) fresh[tuple([*coords.T])] = color #for i2,r2 in map_image_ids(img).iterrows(): # fresh[r2['y']][r2['x']] = color img = fresh imgs.append(list(r)+[img]) imgs = pd.DataFrame(imgs,columns=list(self.columns)+['image']) return imgs
[docs] def build_segmentation_image(self,schema,background=(0,0,0,0)): """ Put together an image. Defined by a list of layers with RGBA colors Make the schema example | schema = [ | {'subset_logic':SL(phenotypes=['SOX10+']), | 'edge_color':(31, 31, 46,255), | 'watershed_steps':0, | 'fill_color':(51, 51, 77,255) | }, | {'subset_logic':SL(phenotypes=['CD8+'],scored_calls={'PD1':'+'}), | 'edge_color':(255,0,0,255), | 'watershed_steps':1, | 'fill_color':(0,0,0,255) | }, | {'subset_logic':SL(phenotypes=['CD8+'],scored_calls={'PD1':'-'}), | 'edge_color':(255,0,255,255), | 'watershed_steps':1, | 'fill_color':(0,0,255,255) | } | ] | imgs = imageaccess.build_segmentation_image(schema,background=(0,0,0,255)) Args: schema (list): a list of layers (see example above) background (tuple): a color RGBA 0-255 tuple for the. background color Returns: SegmentationImageOutput: an output suitable for writing images """ cummulative = self.copy() def _set_blank(img,blank): img[:][:] = blank return img cummulative['merged'] = cummulative.apply(lambda x: _set_blank(np.zeros(list(x['shape'])+[4]),background) ,1) for layer in schema: if self.verbose: sys.stderr.write("Calculating layer "+str(layer)+"\n") images = self.get_outline_images(subset_logic=layer['subset_logic'], edge_color=layer['edge_color'], watershed_steps=layer['watershed_steps'], fill_color=layer['fill_color']) cummulative = cummulative.rename(columns={'merged':'old'}) cummulative = cummulative.merge(images,on=list(self.columns)) cummulative['new'] = cummulative.apply(lambda x: _merge_images(x['merged'],x['old']),1) cummulative = cummulative.drop(columns=['old','merged']).rename(columns={'new':'merged'}) cummulative = cummulative.rename(columns={'merged':'image'}) return SegmentationImageOutput(cummulative)
def get_outline_images(self,subset_logic=None,edge_color=(0,0,255,255),fill_color=(135,206,250,255),watershed_steps=1): if len(edge_color) == 3: edge_color = tuple(list(edge_color)+[255]) if len(fill_color) == 3: fill_color = tuple(list(fill_color)+[255]) #if self.verbose: sys.stderr.write("getting outline image\n") #if self.verbose: sys.stderr.write("reading edges\n") edge_images = self.get_segmentation_map_images(type='edge',subset_logic=subset_logic,color=edge_color,blank=(0,0,0,0),watershed_steps=watershed_steps).\ rename(columns={'image':'edge'}) if self.verbose: sys.stderr.write("reading cells\n") cell_images = self.get_segmentation_map_images(type='cell',subset_logic=subset_logic,color=fill_color,blank=(0,0,0,0)).\ rename(columns={'image':'cell'}) if self.verbose: sys.stderr.write("merge edge and cell\n") v = edge_images.merge(cell_images,on=list(self.columns)) v['merged'] = v.apply(lambda x: _merge_images(x['edge'],x['cell']),1) #if self.verbose: sys.stderr.write("finished outline image\n") return v.drop(columns=['cell','edge']) def get_segmentation_maps(self,type='edge'): if type == 'edge' and self._edge_map_cache is not None: return self._edge_map_cache if type == 'cell' and self._cell_map_cache is not None: return self._cell_map_cache if self.verbose: sys.stderr.write("The "+str(type)+" map has not been calculated yet. ... computing.\n") outputs = self.apply_frames(lambda x: x.edge_map() if type == 'edge' else x.cell_map()) dfs = [] for i,r in outputs.iterrows(): df = r['output'] df.index = pd.Index([i for temp in range(0,r['output'].shape[0])]) df.index.set_names(list(self.columns),inplace=True) dfs.append(df) dfs = pd.concat(dfs).reset_index() # we don't need every cell.. just the ones in this cell data frame dfs = dfs.merge(self.cdf[list(self.cdf.frame_columns)+['cell_index']],on=list(self.cdf.frame_columns)+['cell_index']) if type == 'edge': self._edge_map_cache = dfs elif type == 'cell': self._cell_map_cache = dfs else: raise ValueError('edge or cell') if self.verbose: sys.stderr.write("The "+str(type)+" map is finished.\n") return dfs def apply_frames(self,func): samples = self['sample_id'].unique().tolist() data = [] for sample_id in samples: s = self.cdf.db.get_sample(sample_id) if self.verbose: sys.stderr.write("Read in sample "+s.sample_name+" ("+str(sample_id)+")\n") for frame_id in self.loc[self['sample_id']==sample_id,'frame_id']: f = s.get_frame(frame_id) data.append([sample_id,frame_id,func(f)]) return self.merge(pd.DataFrame(data,columns=['sample_id','frame_id','output']),on=['sample_id','frame_id']).set_index(list(self.columns))
def _merge_images(image1,image2): edge = np.uint8(image1) cell = np.uint8(image2) bedge = io.BytesIO() bcell = io.BytesIO() imageio.imwrite(bedge, edge,format='tif') imageio.imwrite(bcell, cell,format='tif') pedge = Image.open(bedge) pcell = Image.open(bcell) pcell.paste(pedge, (0, 0), pedge) return np.array(pcell) def _get_new_regions(cdf,sample,frame_id,unset_label='undefined',gaussian_sigma=66,verbose=False): sub = cdf.loc[cdf['frame_id']==frame_id] #print(sub.iloc[0][['project_name','sample_name','frame_name']]) #print(sub.shape) shape = sub.iloc[0]['frame_shape'] dfs = {} sid = sub.iloc[0]['sample_id'] fid = sub.iloc[0]['frame_id'] proc = cdf.db.get_sample(sid).get_frame(fid).processed_image present = sub['phenotype_label'].unique() for p in sub.phenotypes: empty = np.zeros(shape) if p not in present: dfs[p] = empty.copy().astype(float) continue emap = pd.DataFrame(empty).stack().reset_index().astype(int) emap.columns = ['y','x','id'] emap = emap.drop(columns='id') sel = sub.subset(SL(phenotypes=[p]))[['x','y']].drop_duplicates() sel['id'] = 1 sel = emap.merge(sel,on=['x','y'],how='left').fillna(0).pivot(columns='x',index='y',values='id') sel = np.array(sel).astype(float) blur = gaussian_filter(sel,gaussian_sigma) dfs[p] = blur regions = {} #print('mask') remainder = np.ones(shape).astype(bool) for p in dfs: others = set(dfs.keys())-set([p]) result = np.ones(dfs[p].shape).astype(bool) for o in others: result = (dfs[p] > dfs[o])&result&proc #print(p) result = result.astype(np.uint8) remainder = remainder&(~result.astype(bool)) regions[p] = result.astype(np.uint8) regions[unset_label] = (remainder&proc).astype(np.uint8) return regions def phenotypes_to_regions(cdf,path, gaussian_sigma=66, verbose=False, overwrite=False, unset_label='undefined', project_name='region-refactor'): def _get_label(regions_dict,x,y): for label in regions_dict: if regions_dict[label][y][x] == 1: return label return np.nan if os.path.exists(path) and not overwrite: raise ValueError("cannot overwrite unless overwrite is True") if cdf.db is None: raise ValueError("You need the storage object set for this function") output = cdf.db.__class__(path,mode='w') output.project_name = project_name dfs = {} ocdf = cdf.copy() subs = [] for sample_id in cdf['sample_id'].unique(): sample = cdf.db.get_sample(sample_id) if verbose: sys.stderr.write("==========\nSample: "+sample.sample_name+"\n") for frame_id in cdf.loc[(cdf['sample_id']==sample_id),'frame_id'].unique(): #sub = ocdf.loc[(ocdf['sample_id']==sample_id)&(ocdf['frame_id']==frame_id)].copy() regions = _get_new_regions(cdf,sample,frame_id, verbose=verbose, gaussian_sigma=gaussian_sigma, unset_label = unset_label ) dfs[frame_id] = regions f = sample.get_frame(frame_id) f.set_regions(regions, use_processed_region=True, unset_label='undefined2', verbose=verbose) if verbose: sys.stderr.write(" "+f.frame_name+"\n") output.append_sample(sample) # Now update the cell dataframe if verbose: sys.stderr.write("update phenotype_labels\n") ocdf['region_label'] = ocdf.apply(lambda x: _get_label(dfs[x['frame_id']],x['x'],x['y']),1) ocdf = ocdf.loc[~ocdf['region_label'].isna()].copy() region_sizes = {} for fid in dfs: region_sizes[fid] = {} for label in dfs[fid]: region_sizes[fid][label] = int(dfs[fid][label].astype(int).sum().sum()) ocdf['regions'] = ocdf.apply(lambda x: region_sizes[x['frame_id']],1) return ocdf, output def get_region_images(cdf,output_path,colors,background_color='#000000',overwrite=False,format='png',verbose=False): def hex_to_rgb(h): h = h.lstrip('#') v = tuple(list(int(h[i:i+2], 16) for i in (0, 2, 4))+[255]) return [x/255 for x in v] def write_regions(frame,basedir,colors,background_color,format): rshape = frame.get_data('regions').iloc[0]['image_id'] rshape = frame.get_image(rshape).shape start = np.zeros(list(rshape)+[4]) start[:,:]=hex_to_rgb(background_color) fname = frame.frame_name for i,r in frame.get_data('regions').iterrows(): col = colors[r['region_label']] img = frame.get_image(r['image_id']) start[img==1]=hex_to_rgb(col) imageio.imwrite(os.path.join(basedir,fname+'.'+format), start,format=format) if not cdf.db: raise ValueError("Need db set") if os.path.exists(output_path) and not overwrite: raise ValueError("overwrite is set to False") #os.makedirs(output_path) #for s in cdf.db.sample_iter(): # os.makedirs(os.path.join(output_path,s.sample_name)) values = cdf.loc[:,['project_name','sample_id','frame_name','frame_id']].drop_duplicates() for pname in values['project_name'].unique(): samples = values.loc[values['project_name']==pname] for sid in samples['sample_id'].unique(): s = cdf.db.get_sample(sid) sname = s.sample_name if verbose: sys.stderr.write(str((pname,sname))+"\n") basedir = os.path.join(output_path,pname,sname) if not os.path.exists(basedir): os.makedirs(basedir) frames = samples.loc[samples['sample_id']==sid] for fid in frames['frame_id'].unique(): write_regions(s.get_frame(fid),basedir,colors,background_color,format)