Source code for pythologist_reader

import pandas as pd
import numpy as np
import h5py, os, json, sys, shutil
from uuid import uuid4
from pythologist_image_utilities import map_image_ids
from pythologist_reader.qc import QC
from pythologist import CellDataFrame

"""
These are classes to help deal with cell-level image data
"""

[docs]class CellFrameGeneric(object):
    """
    A generic CellFrameData object
    """
    def __init__(self):
        self._processed_image_id = None
        self._images = {}                      # Database of Images
        self._id = uuid4().hex
        self.frame_name = None
        self.data_tables = {
        'cells':{'index':'cell_index',            
                  'columns':['x','y','phenotype_index',
                             'region_index']},
        'cell_tags':{'index':'db_id',            
                     'columns':['tag_index','cell_index']},
        'cell_measurements':{'index':'measurement_index', 
                             'columns':['cell_index','statistic_index','feature_index','channel_index','value']},
        'measurement_features':{'index':'feature_index',
                                'columns':['feature_label']},
        'measurement_channels':{'index':'channel_index',
                                'columns':['channel_label','channel_abbreviation','image_id']},
        'measurement_statistics':{'index':'statistic_index',
                                  'columns':['statistic_label']},
        'phenotypes':{'index':'phenotype_index',
                      'columns':['phenotype_label']},
        'segmentation_images':{'index':'db_id',
                 'columns':['segmentation_label','image_id']},                     
        'regions':{'index':'region_index',
                   'columns':['region_label','region_size','image_id']},
        'cell_interactions':{'index':'db_id', 
                             'columns':['cell_index','neighbor_cell_index','pixel_count','touch_distance']},
        'tags':{'index':'tag_index',
                'columns':['tag_label']}
                           }
        self._data = {} # Do not acces directly. Use set_data_table and get_data_table to access.
        for x in self.data_tables.keys(): 
            self._data[x] = pd.DataFrame(columns=self.data_tables[x]['columns'])
            self._data[x].index.name = self.data_tables[x]['index']
    @property
    def id(self):
        """
        Returns the project UUID4
        """
        return self._id

    @property
    def shape(self):
        """
        Returns the (tuple) shape of the image (rows,columns)
        """
        return self.processed_image.shape
    
    @property
    def processed_image_id(self):
        """
        Returns (str) id of the frame object
        """
        return self._processed_image_id
    @property
    def processed_image(self):
        """
        Returns (numpy.array) of the processed_image
        """
        return self._images[self._processed_image_id].copy()
[docs]    def set_processed_image_id(self,image_id):
        """
        Args:
            image_id (str): set the id of the frame object
        """
        self._processed_image_id = image_id

    @property
    def table_names(self):
        """
        Return a list of data table names
        """
        return list(self.data_tables.keys())

[docs]    def set_data(self,table_name,table):
        """
        Set the data table

        Args:
            table_name (str): the table name 
            table (pd.DataFrame): the input table
        """
        # Assign data to the standard tables. Do some column name checking to make sure we are getting what we expect
        if table_name not in self.data_tables: raise ValueError("Error table name doesn't exist in defined formats")
        if set(list(table.columns)) != set(self.data_tables[table_name]['columns']): raise ValueError("Error column names don't match defined format\n"+\
                                                                                            str(list(table.columns))+"\n"+\
                                                                                            str(self.data_tables[table_name]['columns']))
        if table.index.name != self.data_tables[table_name]['index']: raise ValueError("Error index name doesn't match defined format")
        self._data[table_name] = table.loc[:,self.data_tables[table_name]['columns']].copy() # Auto-sort, and assign a copy so we aren't ever assigning by reference

[docs]    def set_regions(self,regions,use_processed_region=True,unset_label='undefined',verbose=False):
        """
        Alter the regions in the frame

        Args:
            regions (dict): a dictionary of mutually exclusive region labels and binary masks
                            if a region does not cover all the workable areas then it will be the only label
                            and the unused area will get the 'unset_label' as a different region
            use_processed_region (bool): default True keep the processed region subtracted
            unset_label (str): name of unset regions default (undefined)
        """
        
        # delete our current regions

        regions = regions.copy()
        image_ids = list(self.get_data('mask_images')['image_id'])
        image_ids = [x for x in image_ids if x != self.processed_image_id]
        for image_id in image_ids: del self._images[image_id]

        labels = list(regions.keys())
        ids = [uuid4().hex for x in labels]
        sizes = [regions[x].sum() for x in labels]
        remainder = np.ones(self.processed_image.shape)
        if use_processed_region: remainder = self.processed_image

        for i,label in enumerate(labels):
            my_image = regions[label]
            if use_processed_region: my_image = my_image&self.processed_image
            self._images[ids[i]] = my_image
            remainder = remainder & (~my_image)

        if verbose: sys.stderr.write("Remaining areas after setting are "+str(remainder.sum().sum())+"\n")

        if remainder.sum().sum() > 0:
            labels += [unset_label]
            sizes += [remainder.sum().sum()]
            ids += [uuid4().hex]
            self._images[ids[-1]] = remainder
            regions[unset_label] = remainder

        regions2 = pd.DataFrame({'region_label':labels,
                                 'region_size':sizes,
                                 'image_id':ids
                                })
        regions2.index.name = 'region_index'
        self.set_data('regions',regions2)
        def get_label(x,y,regions_dict):
            for label in regions_dict:
                if regions_dict[label][y][x] == 1: return label
            return np.nan
            raise ValueError("Coordinate is out of bounds for all regions.")
        recode = self.get_data('cells').copy()
        recode['new_region_label'] = recode.apply(lambda x: get_label(x['x'],x['y'],regions),1)
        ## see how many we need to drop because the centroid fall in an unprocessed region
        if verbose: sys.stderr.write(str(recode.loc[recode['new_region_label'].isna()].shape[0])+" cells with centroids beyond the processed region are being dropped\n")
        recode = recode.loc[~recode['new_region_label'].isna()].copy()
        recode = recode.drop(columns='region_index').reset_index().\
            merge(regions2[['region_label']].reset_index(),
                  left_on='new_region_label',right_on='region_label').\
            drop(columns=['region_label','new_region_label']).set_index('cell_index')
        self.set_data('cells',recode)
        return


[docs]    def get_data(self,table_name): 
        """
        Get the data table

        Args:
            table_name (pandas.DataFrame): the table you access by name
        """
        return self._data[table_name].copy()

    def read_hdf(self,h5file,location=''):
        """
        Load this frame's tables, images and metadata from an HDF5 file.

        Args:
            h5file (str): path of the HDF5 file
            location (str): '/'-separated group path of the frame inside the
                            file; '' means the file root
        """
        path_parts = location.split('/') if location != '' else []
        # The context manager closes the handle even if a read below fails
        with h5py.File(h5file,'r') as f:
            subgroup = f
            for part in path_parts:
                subgroup = subgroup[part]
            for table_name in list(subgroup['data']):
                loc = '/'.join(path_parts+['data',table_name])
                self.set_data(table_name,pd.read_hdf(h5file,key=loc))
            # now get images
            for image_name in list(subgroup['images']):
                self._images[image_name] = np.array(subgroup['images'][image_name])
            meta = subgroup['meta']
            self.frame_name = meta.attrs['frame_name']
            self._id = meta.attrs['id']
            self.set_processed_image_id(meta.attrs['processed_image_id'])
        return

    def to_hdf(self,h5file,location='',mode='w'):
        """
        Write this frame's tables, images and metadata to an HDF5 file.

        Args:
            h5file (str): path of the HDF5 file
            location (str): group path inside the file under which the
                            'data', 'images' and 'meta' entries are created
            mode (str): h5py mode used for the first open of the file;
                        everything after that is appended
        """
        # Create the containers, and release the handle before pandas opens
        # the same file to write the tables.
        with h5py.File(h5file,mode) as f:
            f.create_group(location+'/data')
            f.create_group(location+'/images')
        for table_name in self.data_tables.keys():
            data_table = self.get_data(table_name)
            # key is passed by keyword: positional use is deprecated in pandas
            data_table.to_hdf(h5file,
                              key=location+'/data/'+table_name,
                              mode='a',
                              format='table',
                              complib='zlib',
                              complevel=9)
        with h5py.File(h5file,'a') as f:
            for image_id in self._images.keys():
                f.create_dataset(location+'/images/'+image_id,data=self._images[image_id],compression='gzip',compression_opts=9)
            # 'meta' is a placeholder dataset; the metadata lives in its attrs
            dset = f.create_dataset(location+'/meta', (100,), dtype=h5py.special_dtype(vlen=str))
            dset.attrs['frame_name'] = self.frame_name
            dset.attrs['processed_image_id'] = self.processed_image_id
            dset.attrs['id'] = self._id

[docs]    def cell_map(self):
        """
        Return a dataframe of cell ID's and locations
        """
        if 'cell_map' not in list(self.get_data('segmentation_images')['segmentation_label']): return None
        cmid = self.get_data('segmentation_images').set_index('segmentation_label').loc['cell_map','image_id']
        return map_image_ids(self.get_image(cmid)).rename(columns={'id':'cell_index'})

[docs]    def cell_map_image(self):
        """
        Return a the image of cells by ID's
        """
        if 'cell_map' not in list(self.get_data('segmentation_images')['segmentation_label']): return None
        cmid = self.get_data('segmentation_images').set_index('segmentation_label').loc['cell_map','image_id']
        return self.get_image(cmid)

[docs]    def edge_map(self):
        """
        Return a dataframe of cells by ID's of coordinates only on the edge of the cells
        """
        if 'edge_map' not in list(self.get_data('segmentation_images')['segmentation_label']): return None
        cmid = self.get_data('segmentation_images').set_index('segmentation_label').loc['edge_map','image_id']
        return map_image_ids(self.get_image(cmid)).\
                   rename(columns={'id':'cell_index'})

[docs]    def edge_map_image(self):
        """
        Return an image of edges of integers by ID
        """
        if 'edge_map' not in list(self.get_data('segmentation_images')['segmentation_label']): return None
        cmid = self.get_data('segmentation_images').set_index('segmentation_label').loc['edge_map','image_id']
        return self.get_image(cmid)

[docs]    def segmentation_info(self):
        """
        Return a dataframe with info about segmentation like cell areas and circumferences
        """
        
        # handle the case where there is no edge data
        if self.edge_map() is None:
            return pd.DataFrame(index=self.get_data('cells').index,columns=['edge_pixels','area_pixels'])

        return self.edge_map().reset_index().groupby(['cell_index']).count()[['x']].rename(columns={'x':'edge_pixels'}).\
            merge(self.cell_map().reset_index().groupby(['cell_index']).count()[['x']].rename(columns={'x':'area_pixels'}),
                  left_index=True,
                  right_index=True).reset_index().set_index('cell_index')
[docs]    def interaction_map(self):
        """
        Returns:
            pandas.DataFrame: return a dataframe of which cells are in contact with one another
        """
        return self.get_data('cell_interactions')
[docs]    def set_interaction_map(self,touch_distance=1):
        """
        Measure the cell-cell contact interactions

        Args:
            touch_distance (int): optional default is 1 distance to look away from a cell for another cell
        """
        full = self.cell_map()
        edge = self.edge_map()
        if full is None or edge is None: return None
        d1 = edge.reset_index()
        d1['key'] = 1
        d2 = pd.DataFrame({'mod':[-1*touch_distance,0,touch_distance]})
        d2['key'] = 1
        d3 = d1.merge(d2,on='key').merge(d2,on='key')
        d3['x'] = d3['x'].add(d3['mod_x'])
        d3['y'] = d3['y'].add(d3['mod_y'])
        d3 = d3[['x','y','cell_index','key']].rename(columns={'cell_index':'neighbor_cell_index'})
        im = full.reset_index().merge(d3,on=['x','y']).\
            query('cell_index!=neighbor_cell_index').\
            drop_duplicates().groupby(['cell_index','neighbor_cell_index']).count()[['key']].reset_index().\
            rename(columns={'key':'pixel_count'})
        im['touch_distance'] = touch_distance
        im.index.name='db_id'
        self.set_data('cell_interactions',im)

    @property
    def thresholds(self):
        raise ValueError('Override this to use it.')

[docs]    def get_channels(self,all=False):
        """
        Return a dataframe of the Channels

        Args:
            all (bool): default False if all is set to true will also include excluded channels (like autofluoresence)

        Returns:
            pandas.DataFrame: channel information
        """
        if all: return self.get_data('measurement_channels')
        d = self.get_data('measurement_channels')
        return d.loc[~d['channel_label'].isin(self.excluded_channels)]
    def get_regions(self):
        return self.get_data('regions')
    
[docs]    def get_raw(self,feature_label,statistic_label,all=False,channel_abbreviation=True):
        """
        Get the raw data

        Args:
            feature_label (str): name of the feature
            statistic_label (str): name of the statistic to extract
            all (bool): default False if True put out everything including excluded channels
            channel_abbreviation (bool): default True means use the abbreivations if available

        Returns:
            pandas.DataFrame: the dataframe
        """
        stats = self.get_data('measurement_statistics').reset_index()
        stats = stats.loc[stats['statistic_label']==statistic_label,'statistic_index'].iloc[0]
        feat = self.get_data('measurement_features').reset_index()
        feat = feat.loc[feat['feature_label']==feature_label,'feature_index'].iloc[0]
        #region = self.get_data('regions').reset_index()
        #region = region.loc[region['region_label']==region_label,'region_index'].iloc[0]
        measure = self.get_data('cell_measurements')
        measure = measure.loc[(measure['statistic_index']==stats)&(measure['feature_index']==feat)]
        channels = self.get_data('measurement_channels')
        if not all: channels = channels.loc[~channels['channel_label'].isin(self.excluded_channels)]
        measure = measure.merge(channels,left_on='channel_index',right_index=True)
        measure = measure.reset_index().pivot(index='cell_index',columns='channel_label',values='value')
        if not channel_abbreviation: return measure
        temp = dict(zip(self.get_data('measurement_channels')['channel_label'],
                        self.get_data('measurement_channels')['channel_abbreviation']))
        return measure.rename(columns=temp)

    def default_raw(self):
        # override this
        return None

    def copy(self):
        mytype = type(self)
        them = mytype()
        for x in self.data_tables.keys():
            them._data[x] = self._data[x].copy()
        return them

    @property
    def excluded_channels(self):
        raise ValueError("Must be overridden")

[docs]    def binary_calls(self):
        """
        Return all the binary feature calls (alias)
        """
        return phenotype_calls()

[docs]    def phenotype_calls(self):
        """
        Return all the binary feature calls
        """
        phenotypes = self.get_data('phenotypes')['phenotype_label'].dropna().tolist()
        temp = pd.DataFrame(index=self.get_data('cells').index,columns=phenotypes)
        temp = temp.fillna(0)
        temp = temp.merge(self.cell_df()[['phenotype_label']],left_index=True,right_index=True)
        for phenotype in phenotypes:
            temp.loc[temp['phenotype_label']==phenotype,phenotype]=1
        return temp.drop(columns='phenotype_label').astype(np.int8)

    def scored_calls(self):
        # Must be overridden
        return None
        

    @property
    def cdf(self):
        """
        Return the pythologist.CellDataFrame of the frame

        One row per cell.  The row combines the cell's position and region
        label with dictionary-valued columns ('regions', 'scored_calls',
        'phenotype_calls', 'channel_values', 'neighbors'), pixel counts
        ('cell_area', 'edge_length'), the frame name/id/shape, and
        'undefined' placeholders for the sample and project fields.
        """

        # get our region sizes
        # (dict of region_label -> int size; the same dict goes on every row)
        region_sizes = self.get_data('regions').set_index('region_label')['region_size'].astype(int).to_dict()
        # get our cells
        # joined to their region so only region_label survives from the regions table
        temp1 = self.get_data('cells').drop(columns='phenotype_index').\
                       merge(self.get_data('regions'),
                             left_on='region_index',
                             right_index=True).drop(columns=['image_id','region_index','region_size'])
        temp1['regions'] = temp1.apply(lambda x: region_sizes,1)
        # scored calls are optional: scored_calls() returns None in this generic class
        temp2 = self.scored_calls()
        if temp2  is not None:
            # collapse each row into a {column name: value} dict
            temp2 = temp2.apply(lambda x:
                dict(zip(
                    list(x.index),
                    list(x)
                 ))
            ,1).reset_index().rename(columns={0:'scored_calls'}).set_index('cell_index')
            temp1 = temp1.merge(temp2,left_index=True,right_index=True)
        else:
            temp1['scored_calls'] = temp1.apply(lambda x: {},1)
        # phenotype calls, collapsed per row into a {phenotype label: 0/1} dict
        temp3 = self.phenotype_calls().apply(lambda x:
                dict(zip(
                    list(x.index),
                    list(x)
                ))
            ,1).reset_index().rename(columns={0:'phenotype_calls'}).set_index('cell_index')
        
        temp1 = temp1.merge(temp3,left_index=True,right_index=True)
        #temp1['phenotypes_present'] = json.dumps(list(
        #        sorted([x for x in self.get_data('phenotypes')['phenotype_label'] if x is not np.nan])
        #    ))

        temp4 = None
        # extract default values only if we have whole cell
        if "Whole Cell" in self.get_data('measurement_features')['feature_label'].tolist():
            temp4 = self.default_raw()
        if temp4 is not None:
            # collapse each row into a {column name: value} dict
            temp4 = temp4.apply(lambda x:
                dict(zip(
                    list(x.index),
                    list(x)
                ))
            ,1).reset_index().rename(columns={0:'channel_values'}).set_index('cell_index')
            temp1 = temp1.merge(temp4,left_index=True,right_index=True)
        else:
            temp1['channel_values'] = np.nan

        #temp5 = self.interaction_map().groupby('cell_index').\
        #    apply(lambda x: json.dumps(list(sorted(x['neighbor_cell_index'])))).reset_index().\
        #    rename(columns={0:'neighbor_cell_index'}).set_index('cell_index')


        # Get neighbor data .. may not be available for all cells
        #    Set a default of a null frame and only try and set if there are some neighbors present
        neighbors = pd.DataFrame(index=self.get_data('cells').index,columns=['neighbors'])
        if self.interaction_map().shape[0] > 0:
            # per cell: {neighbor_cell_index: pixel_count}
            neighbors = self.interaction_map().groupby('cell_index').\
                apply(lambda x:
                    dict(zip(
                        x['neighbor_cell_index'].astype(int),x['pixel_count'].astype(int)
                    ))
                ).reset_index().rename(columns={0:'neighbors'}).set_index('cell_index')

        # only do edges if we have them by setting a null value for default
        edge_length = pd.DataFrame(index=self.get_data('cells').index,columns=['edge_length'])
        if self.edge_map() is not None:
            # number of edge-map rows per cell
            edge_length = self.edge_map().reset_index().groupby('cell_index').count()[['x']].\
                rename(columns={'x':'edge_length'})
            edge_length['edge_length'] = edge_length['edge_length'].astype(int)

        # same default-then-fill pattern for the cell area
        cell_area = pd.DataFrame(index=self.get_data('cells').index,columns=['cell_area'])
        if self.cell_map() is not None:
            # number of cell-map rows per cell
            cell_area = self.cell_map().reset_index().groupby('cell_index').count()[['x']].\
                rename(columns={'x':'cell_area'})
            cell_area['cell_area'] = cell_area['cell_area'].astype(int)

        # neighbors are left-joined, so cells with area/edge data but no
        # recorded contact come through as NaN and are given an empty dict
        temp5 = cell_area.merge(edge_length,left_index=True,right_index=True).merge(neighbors,left_index=True,right_index=True,how='left')
        temp5.loc[temp5['neighbors'].isna(),'neighbors'] = temp5.loc[temp5['neighbors'].isna(),'neighbors'].apply(lambda x: {}) # these are ones we actuall have measured

        # left join keeps every cell of temp1, including those absent from temp5
        temp1 = temp1.merge(temp5,left_index=True,right_index=True,how='left')
        temp1.loc[temp1['neighbors'].isna(),'neighbors'] = np.nan # These we were not able to measure


        temp1['frame_name'] = self.frame_name
        temp1['frame_id'] = self.id
        temp1  = temp1.reset_index()
        temp1 = temp1.sort_values('cell_index').reset_index(drop=True)
        # sample/project fields are placeholders at the frame level
        temp1['sample_name'] = 'undefined'
        temp1['project_name'] = 'undefined'
        temp1['sample_id'] = 'undefined'
        temp1['project_id'] = 'undefined'
        def _get_phenotype(d):
            # first label whose call equals 1; NaN when there is none
            vals = [k for k,v in d.items() if v ==  1]
            return np.nan if len(vals) == 0 else vals[0]
        temp1['phenotype_label'] = temp1.apply(lambda x:
                  _get_phenotype(x['phenotype_calls'])
            ,1)
        # Let's tack on the image shape
        temp1['frame_shape'] = temp1.apply(lambda x: self.shape,1)
        return CellDataFrame(temp1)

    def binary_df(self):
        temp1 = self.phenotype_calls().stack().reset_index().\
            rename(columns={'level_1':'binary_phenotype',0:'score'})
        temp1.loc[temp1['score']==1,'score'] = '+'
        temp1.loc[temp1['score']==0,'score'] = '-'
        temp1['gated'] = 0
        temp1.index.name = 'db_id'
        return temp1

    def cell_df(self):
        celldf = self.get_data('cells').\
            merge(self.get_data('regions').rename(columns={'image_id':'region_image_id'}),
                  left_on='region_index',
                  right_index=True).\
            merge(self.get_data('phenotypes'),left_on='phenotype_index',right_index=True).\
            merge(self.segmentation_info(),left_index=True,right_index=True,how='left')
        return celldf.drop(columns=['phenotype_index','region_index'])

    def complete_df(self):
        # a dataframe for every cell that has everything
        return

[docs]    def get_image(self,image_id):
        """
        Args:
            image_id (str): get the image by this id

        Returns:
            numpy.array: an image representing a 2d array
        """
        return self._images[image_id].copy()

[docs]class CellSampleGeneric(object):
    def __init__(self):
        self._frames = {}
        self._key = None
        self._id = uuid4().hex
        self.sample_name = np.nan
        return

    @property
    def id(self):
        """
        Return the UUID4 str
        """
        return self._id

    def create_cell_frame_class(self):
        """
        Factory for the frame objects this sample holds; returns a new
        CellFrameGeneric.
        """
        new_frame = CellFrameGeneric()
        return new_frame

    @property
    def frame_ids(self):
        """
        Return the list of frame IDs
        """
        return sorted(list(self._frames.keys()))

    @property
    def key(self):
        """
        Return a pandas.DataFrame of info about the sample
        """
        return self._key

[docs]    def get_frame(self,frame_id):
        """
        Args:
            frame_id (str): the ID of the frame you want to access

        Returns:
            CellFrameGeneric: the cell frame
        """
        return self._frames[frame_id]

    @property
    def cdf(self):
        """
        The pythologist.CellDataFrame of the sample: the frames' cdf tables
        stacked in frame_id order, stamped with this sample's name and id.
        Project fields are left as 'undefined'.
        """
        per_frame = []
        for frame_id in self.frame_ids:
            frame_cdf = self.get_frame(frame_id).cdf
            frame_cdf['sample_name'] = self.sample_name
            frame_cdf['sample_id'] = self.id
            per_frame.append(frame_cdf)
        combined = pd.concat(per_frame).reset_index(drop=True)
        combined.index.name = 'db_id'
        for column in ('project_name','project_id'):
            combined[column] = 'undefined'
        return CellDataFrame(pd.DataFrame(combined))


    def to_hdf(self,h5file,location='',mode='w'):
        """
        Write this sample (metadata, frames and key table) to an HDF5 file.

        Existing 'meta' and 'frames' entries under ``location`` are deleted
        and rewritten.

        Args:
            h5file (str): path of the HDF5 file
            location (str): group path inside the file for this sample
            mode (str): h5py mode used for the first open of the file
        """
        # The handle is released (even on error) before the frames and pandas
        # reopen the same file.
        with h5py.File(h5file,mode) as f:
            if location+'/meta' in f:
                del f[location+'/meta']
            # 'meta' is a placeholder dataset; the metadata lives in its attrs
            dset = f.create_dataset(location+'/meta', (100,), dtype=h5py.special_dtype(vlen=str))
            dset.attrs['sample_name'] = self.sample_name
            dset.attrs['id'] = self._id
            if location+'/frames' in f:
                del f[location+'/frames']
            f.create_group(location+'/frames')
        for frame_id in self.frame_ids:
            frame = self._frames[frame_id]
            frame.to_hdf(h5file,
                         location+'/frames/'+frame_id,
                         mode='a')
        # key is passed by keyword: positional use is deprecated in pandas
        self._key.to_hdf(h5file,key=location+'/info',mode='r+',format='table',complib='zlib',complevel=9)


    def read_hdf(self,h5file,location=''):
        """
        Load this sample (metadata, frames and key table) from an HDF5 file.

        Args:
            h5file (str): path of the HDF5 file
            location (str): '/'-separated group path of the sample inside the
                            file; '' means the file root
        """
        path_parts = location.split('/') if location != '' else []
        # The context manager closes the handle even if a read below fails
        with h5py.File(h5file,'r') as f:
            subgroup = f
            for part in path_parts:
                subgroup = subgroup[part]
            self._id = subgroup['meta'].attrs['id']
            self.sample_name = subgroup['meta'].attrs['sample_name']
            for frame_id in list(subgroup['frames']):
                # the factory lets subclasses supply their own frame type
                cellframe = self.create_cell_frame_class()
                loc = '/'.join(path_parts+['frames',frame_id])
                cellframe.read_hdf(h5file,location=loc)
                self._frames[frame_id] = cellframe
            loc = '/'.join(path_parts+['info'])
            self._key = pd.read_hdf(h5file,key=loc)
        return

    def cell_df(self):
        frames = []
        for frame_id in self.frame_ids:
            frame = self.get_frame(frame_id).cell_df().reset_index()
            key_line = self.key.set_index('frame_id').loc[[frame_id]].reset_index()
            key_line['key'] = 1
            frame['key'] = 1
            frame = key_line.merge(frame,on='key').drop(columns = 'key')
            frames.append(frame)
        frames = pd.concat(frames).reset_index(drop=True)
        frames.index.name = 'sample_cell_index'
        return frames

    def binary_df(self):
        fc = self.cell_df()[['frame_id','cell_index']].reset_index()
        frames = []
        for frame_id in self.frame_ids:
            frame = self.get_frame(frame_id).binary_df()
            key_line = self.key.set_index('frame_id').loc[[frame_id]].reset_index()
            key_line['key'] = 1
            frame['key'] = 1
            frame = key_line.merge(frame,on='key').drop(columns = 'key')
            frames.append(frame)
        return fc.merge(pd.concat(frames).reset_index(drop=True),on=['frame_id','cell_index'])

    def interaction_map(self):
        fc = self.cell_df()[['frame_id','cell_index']].reset_index()
        frames = []
        for frame_id in self.frame_ids:
            frame = self.get_frame(frame_id).interaction_map()
            key_line = self.key.set_index('frame_id').loc[[frame_id]].reset_index()
            key_line['key'] = 1
            frame['key'] = 1
            frame = key_line.merge(frame,on='key').drop(columns = 'key')
            frames.append(frame)
        frames = pd.concat(frames).reset_index(drop=True)
        return frames.merge(fc,on=['frame_id','cell_index']).\
                      merge(fc.rename(columns={'sample_cell_index':'neighbor_sample_cell_index',
                                               'cell_index':'neighbor_cell_index'}),
                            on=['frame_id','neighbor_cell_index'])
[docs]    def frame_iter(self):
        """
        An iterator of frames

        Returns:
            CellFrameGeneric
        """
        for frame_id in self.frame_ids:
            yield self.get_frame(frame_id)

[docs]class CellProjectGeneric(object):
    def __init__(self,h5path,mode='r'):
        """
        Create a CellProjectGeneric object or read from/add to an existing one

        For modes 'w' and 'r+' the file is opened and the '/samples' group and
        '/meta' dataset are created when missing; the project_name,
        microns_per_pixel and id attributes are then (re)initialized.  No
        initialization is performed for the other modes.

        Args:
            h5path (str): path to read/from or store/to
            mode (str): 'r' read (file must exist), 'w' create/write,
                        'r+' read/write (h5py requires the file to exist),
                        'a' append

        Raises:
            ValueError: mode is 'r' and ``h5path`` does not exist
        """
        self._key = None
        self.h5path = h5path
        self.mode = mode
        self._sample_cache_name = None
        self._sample_cache = None
        if mode =='r':
            if not os.path.exists(h5path): raise ValueError("Cannot read a file that does not exist")
        if mode == 'w' or mode == 'r+':
            # The context manager closes the handle even if setup fails
            with h5py.File(self.h5path,mode) as f:
                if '/samples' not in f.keys():
                    f.create_group('/samples')
                if '/meta' not in f.keys():
                    # placeholder dataset; the metadata lives in its attrs
                    dset = f.create_dataset('/meta', (100,), dtype=h5py.special_dtype(vlen=str))
                else:
                    dset = f['/meta']
                dset.attrs['project_name'] = np.nan
                dset.attrs['microns_per_pixel'] = np.nan
                dset.attrs['id'] = uuid4().hex
        return

    def copy(self,path,overwrite=False,output_mode='r'):
        if os.path.exists(path) and overwrite is False: 
            raise ValueError("Cannot overwrite unless overwrite is set to True")
        shutil.copy(self.h5path,path)
        return self.__class__(path,mode=output_mode)


    @classmethod
    def concat(self,path,array_like,overwrite=False,verbose=False):
        if os.path.exists(path) and overwrite is False: 
            raise ValueError("Cannot overwrite unless overwrite is set to True")
        # copy the first 
        arr = [x for x in array_like]
        if len(arr) == 0: raise ValueError("cannot concat empty list")
        if verbose: sys.stderr.write("Copy the first element\n")
        cpi = arr[0].copy(path,output_mode='r+',overwrite=overwrite)
        #shutil.copy(arr[0].h5path,path)
        #cpi = CellProjectGeneric(path,mode='r+')
        if len(arr) == 1: return 
        for project in array_like[1:]:
            if verbose: sys.stderr.write("Add project "+str(project.id)+" "+str(project.project_name)+"\n")
            for s in project.sample_iter():
                if verbose: sys.stderr.write("   Add sample "+str(s.id)+" "+str(s.sample_name)+"\n")
                cpi.append_sample(s)
        return cpi

[docs]    def append_sample(self,sample):
        """
        Append sample to the project

        Args:
            sample (CellSampleGeneric): sample object
        """
        if self.mode == 'r': raise ValueError("Error: cannot write to a path in read-only mode.")
        sample.to_hdf(self.h5path,location='samples/'+sample.id,mode='a')
        
        current = self.key
        if current is None:
            current = pd.DataFrame([{'sample_id':sample.id,
                                     'sample_name':sample.sample_name}])
            current.index.name = 'db_id'
        else:
            iteration = max(current.index)+1
            addition = pd.DataFrame([{'db_id':iteration,
                                      'sample_id':sample.id,
                                      'sample_name':sample.sample_name}]).set_index('db_id')
            current = pd.concat([current,addition])
        current.to_hdf(self.h5path,'info',mode='r+',complib='zlib',complevel=9,format='table')
        return

[docs]    def qc(self,*args,**kwargs):
        """
        Returns:
            QC: QC class to do quality checks
        """
        return QC(self,*args,**kwargs)

    @property
    def id(self):
        """
        Returns the project id stored in the 'id' attribute of the
        'meta' entry of the h5 file
        """
        # The context manager closes the file even if the lookup raises
        with h5py.File(self.h5path,'r') as f:
            return f['meta'].attrs['id']

    @property
    def project_name(self):
        """
        Return or set the (str) project_name

        The value lives in the 'project_name' attribute of the 'meta'
        entry of the h5 file. Setting it raises ValueError when self.mode
        is 'r'.
        """
        # The context manager closes the file even if the lookup raises
        with h5py.File(self.h5path,'r') as f:
            return f['meta'].attrs['project_name']
    @project_name.setter
    def project_name(self,name):
        if self.mode == 'r': raise ValueError('cannot write if read only')
        # The context manager closes the file even if the write raises
        with h5py.File(self.h5path,'r+') as f:
            f['meta'].attrs['project_name'] = name

    @property
    def microns_per_pixel(self):
        """
        Return or set the (float) microns_per_pixel

        The value lives in the 'microns_per_pixel' attribute of the 'meta'
        entry of the h5 file. Setting it raises ValueError when self.mode
        is 'r'.
        """
        # The context manager closes the file even if the lookup raises
        with h5py.File(self.h5path,'r') as f:
            return f['meta'].attrs['microns_per_pixel']
    @microns_per_pixel.setter
    def microns_per_pixel(self,value):
        if self.mode == 'r': raise ValueError('cannot write if read only')
        # The context manager closes the file even if the write raises
        with h5py.File(self.h5path,'r+') as f:
            f['meta'].attrs['microns_per_pixel'] = value

[docs]    def set_id(self,name):
        """
        Set the project ID

        Args:
            name (str): project_id
        """
        if self.mode == 'r': raise ValueError('cannot write if read only')
        f = h5py.File(self.h5path,'r+')
        #dset = f.create_dataset('/meta', (100,), dtype=h5py.special_dtype(vlen=str))
        f['meta'].attrs['id'] = name
        f.close()

    @property
    def cdf(self):
        """
        Return the pythologist.CellDataFrame of the project

        The cdf of every sample is tagged with 'project_name' and
        'project_id' columns, the results are concatenated and re-indexed
        from zero under the index name 'db_id'.
        """
        sample_ids = self.sample_ids
        # project_name and id each open the h5 file, so read them once
        # instead of once per sample
        project_name = self.project_name
        project_id = self.id
        output = []
        for sample_id in sample_ids:
            temp = self.get_sample(sample_id).cdf
            temp['project_name'] = project_name
            temp['project_id'] = project_id
            output.append(temp)
        output = pd.concat(output).reset_index(drop=True)
        output.index.name = 'db_id'
        cdf = CellDataFrame(pd.DataFrame(output))
        # Read once; the property opens the h5 file on every access
        microns_per_pixel = self.microns_per_pixel
        # NOTE(review): NaN is truthy, so a NaN stored in the file is
        # copied onto the cdf too -- confirm that is intended
        if microns_per_pixel: cdf.microns_per_pixel = microns_per_pixel
        return cdf

    def cell_df(self):
        """
        Build one cell table covering every sample of the project

        Each sample's cell_df() (with its index turned back into a column)
        is prefixed with that sample's columns from the project key, and
        the per-sample tables are stacked in sample_ids order.

        Returns:
            pandas.DataFrame: the stacked table, indexed from zero under
            the index name 'project_cell_index'
        """
        sample_ids = self.sample_ids
        # self.key reads the h5 file on every access, so load it once
        # rather than once per sample
        key_by_sample = self.key.set_index('sample_id')
        samples = []
        for sample_id in sample_ids:
            sample = self.get_sample(sample_id).cell_df().reset_index().assign(key=1)
            key_line = key_by_sample.loc[[sample_id]].reset_index().assign(key=1)
            # Merging on a constant column pairs the key row(s) with every cell row
            samples.append(key_line.merge(sample,on='key').drop(columns = 'key'))
        samples = pd.concat(samples).reset_index(drop=True)
        samples.index.name = 'project_cell_index'
        return samples

    def binary_df(self):
        """
        Build one binary_df table covering every sample of the project

        Each sample's binary_df() is prefixed with that sample's columns
        from the project key, the per-sample tables are stacked, and the
        result is matched to cell_df() on sample_id, frame_id and
        cell_index so every row carries the cell's 'project_cell_index'.

        Returns:
            pandas.DataFrame: the merged table
        """
        # reset_index turns the 'project_cell_index' index of cell_df into a column
        fc = self.cell_df()[['sample_id','frame_id','cell_index']].reset_index()
        # self.key reads the h5 file on every access, so load it once
        key_by_sample = self.key.set_index('sample_id')
        samples = []
        for sample_id in self.sample_ids:
            # assign returns a new frame, so the table handed back by the
            # sample is not modified
            sample = self.get_sample(sample_id).binary_df().assign(key=1)
            key_line = key_by_sample.loc[[sample_id]].reset_index().assign(key=1)
            # Merging on a constant column pairs the key row(s) with every sample row
            samples.append(key_line.merge(sample,on='key').drop(columns = 'key'))
        return fc.merge(pd.concat(samples).reset_index(drop=True),on=['sample_id','frame_id','cell_index'])
    
    def interaction_map(self):
        """
        Build one interaction table covering every sample of the project

        Each sample's interaction_map() is prefixed with that sample's
        columns from the project key and the per-sample tables are
        stacked. The result is then matched to cell_df() twice: once on
        cell_index to add 'project_cell_index', and once on
        neighbor_cell_index to add 'neighbor_project_cell_index'.

        Returns:
            pandas.DataFrame: the merged table
        """
        # reset_index turns the 'project_cell_index' index of cell_df into a column
        fc = self.cell_df()[['sample_id','frame_id','cell_index']].reset_index()
        # self.key reads the h5 file on every access, so load it once
        key_by_sample = self.key.set_index('sample_id')
        samples = []
        for sample_id in self.sample_ids:
            # assign returns a new frame, so the table handed back by the
            # sample is not modified
            sample = self.get_sample(sample_id).interaction_map().assign(key=1)
            key_line = key_by_sample.loc[[sample_id]].reset_index().assign(key=1)
            # Merging on a constant column pairs the key row(s) with every sample row
            samples.append(key_line.merge(sample,on='key').drop(columns = 'key'))
        samples = pd.concat(samples).reset_index(drop=True)
        # The same lookup table, relabelled so it resolves the neighbor cell
        neighbor_fc = fc.rename(columns={'project_cell_index':'neighbor_project_cell_index',
                                         'cell_index':'neighbor_cell_index'})
        return samples.merge(fc,on=['sample_id','frame_id','cell_index']).\
                       merge(neighbor_fc,on=['sample_id','frame_id','neighbor_cell_index'])


    def create_cell_sample_class(self):
        """
        Create the empty sample object used by this project

        Returns:
            CellSampleGeneric: a newly constructed sample
        """
        new_sample = CellSampleGeneric()
        return new_sample
    
    @property
    def sample_ids(self):
        """
        Return the list of sample_ids

        Returns:
            list: the 'sample_id' values of the project key in sorted
            order, or an empty list when the project has no key yet
        """
        key = self.key
        # key is None when the h5 file has no 'info' table
        if key is None: return []
        return sorted(key['sample_id'])

[docs]    def get_sample(self,sample_id):
        """
        Get the sample_id

        Args:
            sample_id (str): set the sample id
        """
        if self._sample_cache_name == sample_id:
            return self._sample_cache
        sample = self.create_cell_sample_class()
        sample.read_hdf(self.h5path,'samples/'+sample_id)
        self._sample_cache_name = sample_id
        self._sample_cache = sample
        return sample

    @property
    def key(self):
        """
        Get info about the project

        Returns:
            the 'info' table read from the h5 file with pandas.read_hdf,
            or None when the file has no top-level 'info' entry
        """
        # The context manager closes the handle even if the check raises,
        # and it is closed before pandas opens the same file
        with h5py.File(self.h5path,'r') as f:
            has_info = 'info' in f
        if not has_info: return None
        return pd.read_hdf(self.h5path,'info')
    
[docs]    def sample_iter(self):
        """
        An interator of CellSampleGeneric
        """
        for sample_id in self.sample_ids: yield self.get_sample(sample_id)

[docs]    def frame_iter(self):
        """
        An interator of CellFrameGeneric
        """
        for s in self.sample_iter():
            for frame_id in s.frame_ids:
                yield s.get_frame(frame_id)

    @property
    def channel_image_dataframe(self):
        """
        dataframe with info about channels and images

        For every frame of every sample, the frame's
        'measurement_channels' table is labelled with the project, sample
        and frame names and ids plus the frame's processed_image_id. The
        labelled tables are stacked and re-indexed from zero.
        """
        project_labels = [('project_name',self.project_name),
                          ('project_id',self.id)]
        tables = []
        for sample in self.sample_iter():
            sample_labels = [('sample_name',sample.sample_name),
                             ('sample_id',sample.id)]
            for frame in sample.frame_iter():
                frame_labels = [('frame_name',frame.frame_name),
                                ('frame_id',frame.id)]
                channels = frame.get_data('measurement_channels')
                # Columns are added in place, in this fixed order
                for column,value in project_labels+sample_labels+frame_labels:
                    channels[column] = value
                channels['processed_image_id'] = frame.processed_image_id
                tables.append(channels)
        stacked = pd.concat(tables)
        return stacked.reset_index(drop=True)

[docs]    def get_image(self,sample_id,frame_id,image_id):
        """
        Get an image by sample frame and image id

        Args:
            sample_id (str): unique sample id
            frame_id (str): unique frame id
            image_id (str): unique image id

        Returns:
            numpy.array: 2d image array
        """
        s = self.get_sample(sample_id)
        f = s.get_frame(frame_id)
        return f.get_image(image_id)