# Source code for sidekit.bosaris.idmap

# -*- coding: utf-8 -*-

# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from `the website
# <https://sites.google.com/site/bosaristoolkit/>`_.

"""
This is the 'idmap' module
"""
import sys
import numpy
import logging
import copy
import h5py

from ..sidekit_wrappers import check_path_existance


__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


class IdMap:
    """A class that stores a map between identifiers (strings).

    The map is held as aligned arrays: entry ``i`` of `leftids` is associated
    with entry ``i`` of `rightids` and with the bounds ``start[i]`` and
    ``stop[i]``.  The class provides methods that convert a sequence of left
    ids to a sequence of right ids and vice versa, as well as filtering,
    merging, splitting and HDF5 / text serialization.

    If an identifier appears several times on the side used as lookup key,
    the mapping methods only use its last association.

    :attr leftids: a list of classes in a ndarray
    :attr rightids: a list of segments in a ndarray
    :attr start: index of the first frame of the segment
    :attr stop: index of the last frame of the segment
    """

    def __init__(self, idmap_filename=''):
        """Initialize an IdMap object

        :param idmap_filename: name of a file to load with :meth:`read`.
            Default is ''. In case the idmap_filename is empty, initialize
            an empty IdMap object.
        """
        self.leftids = numpy.empty(0, dtype="|O")
        self.rightids = numpy.empty(0, dtype="|O")
        self.start = numpy.empty(0, dtype="|O")
        self.stop = numpy.empty(0, dtype="|O")

        if idmap_filename != '':
            loaded = IdMap.read(idmap_filename)
            self.leftids = loaded.leftids
            self.rightids = loaded.rightids
            self.start = loaded.start
            self.stop = loaded.stop

    def __repr__(self):
        """Return a multi-line dump of the four arrays framed by dashed rules."""
        rule = '-' * 30
        rows = (
            rule,
            'left ids:' + repr(self.leftids),
            'right ids:' + repr(self.rightids),
            'seg start:' + repr(self.start),
            'seg stop:' + repr(self.stop),
            rule,
        )
        return '\n'.join(rows) + '\n'

    def _subset(self, selector):
        """Return a new IdMap restricted to the entries picked by `selector`.

        :param selector: a boolean mask or an array of integer indices, applied
            identically to the four arrays.

        :return: a new IdMap object.
        """
        subset = IdMap()
        subset.leftids = self.leftids[selector]
        subset.rightids = self.rightids[selector]
        subset.start = self.start[selector]
        subset.stop = self.stop[selector]
        return subset

    @staticmethod
    def _map_ids(keys, values, queries):
        """Translate `queries` through the key -> value association.

        :param keys: the ids to match the queries against
        :param values: the ids returned for the matching keys (aligned with keys)
        :param queries: the ids to translate

        :return: an object ndarray holding, in query order, the value of each
            query that is a known key. Unknown queries are skipped and counted
            in a logged warning.
        """
        # As with any dict built from pairs, a repeated key keeps its last value
        lookup = dict(zip(keys, values))
        queries = list(queries)

        found = [query for query in queries if query in lookup]
        mapped = numpy.empty(len(found), dtype='|O')
        for position, query in enumerate(found):
            mapped[position] = lookup[query]

        lost_ids = len(set(queries) - set(lookup))
        if lost_ids:
            logging.warning('{} ids could not be mapped'.format(lost_ids))

        return mapped

    @staticmethod
    def _encode_bounds(bounds):
        """Return `bounds` as an int32 array in which missing values are -1.

        :param bounds: the `start` or `stop` array of an IdMap

        :return: a new int32 ndarray; `bounds` itself is left untouched.
        """
        encoded = copy.deepcopy(bounds)
        # Casting to float turns None entries into NaN, which locates them
        encoded[numpy.isnan(bounds.astype('float'))] = -1
        return encoded.astype('int32', copy=False)

    @check_path_existance
    def write(self, output_file_name):
        """ Save IdMap in HDF5 format

        The file holds four datasets: "leftids", "rightids", "start" and
        "stop". Missing start / stop values are stored as -1.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong IdMap format"
        with h5py.File(output_file_name, "w") as f:
            f.create_dataset("leftids", data=self.leftids.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("rightids", data=self.rightids.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("start", data=self._encode_bounds(self.start),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("stop", data=self._encode_bounds(self.stop),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def write_txt(self, output_file_name):
        """Saves the Id_Map to a text file.

        One line is written per entry, with space separated fields:
        left id, right id, start and stop. A start or stop equal to None is
        omitted from the line.

        :param output_file_name: name of the output text file
        """
        with open(output_file_name, 'w') as outputFile:
            for left, right, start, stop in zip(self.leftids, self.rightids, self.start, self.stop):
                fields = [left, right]
                # str(None) is the non-empty string 'None', so missing bounds
                # have to be dropped before the conversion to text
                fields.extend(str(bound) for bound in (start, stop) if bound is not None)
                outputFile.write(' '.join(filter(None, fields)) + '\n')

    def map_left_to_right(self, leftidlist):
        """Maps an array of ids to a new array of ids using the given map.
        The input ids are matched against the leftids of the map and the
        output ids are taken from the corresponding rightids of the map.

        Beware: if leftids are not unique in the IdMap, only the last value
        corresponding is kept

        :param leftidlist: an array of strings to be matched against the
            leftids of the idmap.  The rightids corresponding to these
            leftids will be returned.

        :return: an array of strings that are the mappings of the
            strings in leftidlist, in the order of leftidlist. Ids that
            are not in the map are skipped and reported in a warning.
        """
        return self._map_ids(self.leftids, self.rightids, leftidlist)

    def map_right_to_left(self, rightidlist):
        """Maps an array of ids to a new array of ids using the given map.
        The input ids are matched against the rightids of the map and the
        output ids are taken from the corresponding leftids of the map.

        Beware: if rightids are not unique in the IdMap, only the last value
        corresponding is kept

        :param rightidlist: An array of strings to be matched against the
            rightids of the idmap.  The leftids corresponding to these
            rightids will be returned.

        :return: an array of strings that are the mappings of the
            strings in rightidlist, in the order of rightidlist. Ids that
            are not in the map are skipped and reported in a warning.
        """
        return self._map_ids(self.rightids, self.leftids, rightidlist)

    def filter_on_left(self, idlist, keep):
        """Removes some of the information in an idmap.  Depending on the
        value of 'keep', the idlist indicates the strings to retain or
        the strings to discard.

        :param idlist: an array of strings which will be compared with
            the leftids of the current IdMap.
        :param keep: A boolean indicating whether idlist contains the ids to
            keep or to discard.

        :return: a filtered version of the current IdMap.
        """
        listed = numpy.isin(self.leftids, idlist)
        return self._subset(listed if keep else ~listed)

    def filter_on_right(self, idlist, keep):
        """Removes some of the information in an idmap.  Depending on the
        value of 'keep', the idlist indicates the strings to retain or
        the strings to discard.

        :param idlist: an array of strings which will be compared with
            the rightids of the current IdMap.
        :param keep: a boolean indicating whether idlist contains the ids to
            keep or to discard.

        :return: a filtered version of the current IdMap.
        """
        listed = numpy.isin(self.rightids, idlist)
        return self._subset(listed if keep else ~listed)

    def validate(self, warn=False):
        """Checks that an object of type Id_Map obeys certain rules that
        must always be true: the four arrays share the same shape and are
        one-dimensional.

        :param warn: boolean. If True, log a warning if strings are
            duplicated in either left or right array

        :return: a boolean value indicating whether the object is valid.

        """
        same_shape = self.leftids.shape == self.rightids.shape == self.start.shape == self.stop.shape
        # 'and' rather than '&': '&' binds tighter than '==', which would
        # compare (same_shape & ndim) to 1 instead of comparing ndim to 1
        ok = same_shape and self.leftids.ndim == 1

        if warn and self.leftids.shape != numpy.unique(self.leftids).shape:
            logging.warning('The left id list contains duplicate identifiers')
        if warn and self.rightids.shape != numpy.unique(self.rightids).shape:
            logging.warning('The right id list contains duplicate identifiers')
        return ok

    def set(self, left, right, start=None, stop=None):
        """Fill the IdMap with copies of the given arrays.

        No validation is performed; call :meth:`validate` to check the result.

        :param left: the array to copy into leftids
        :param right: the array to copy into rightids
        :param start: the array to copy into start. If None, start becomes an
            object array of the same shape as rightids
        :param stop: the array to copy into stop. If None, stop becomes an
            object array of the same shape as rightids
        """
        self.leftids = copy.deepcopy(left)
        self.rightids = copy.deepcopy(right)

        if start is not None:
            self.start = copy.deepcopy(start)
        else:
            self.start = numpy.empty(self.rightids.shape, '|O')

        if stop is not None:
            self.stop = copy.deepcopy(stop)
        else:
            self.stop = numpy.empty(self.rightids.shape, '|O')

    @staticmethod
    def read(input_file_name):
        """Read IdMap in hdf5 format.

        The datasets "leftids", "rightids", "start" and "stop" are loaded.
        Start / stop values stored as -1 are treated as missing and are not
        copied into the returned object.

        :param input_file_name: name of the file to read from

        :return: the IdMap object read from the file.
        """
        with h5py.File(input_file_name, "r") as f:
            idmap = IdMap()

            idmap.leftids = f.get("leftids")[()]
            idmap.rightids = f.get("rightids")[()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] >= 3:
                idmap.leftids = idmap.leftids.astype('U255', copy=False)
                idmap.rightids = idmap.rightids.astype('U255', copy=False)

            stored_start = f.get("start")[()]
            stored_stop = f.get("stop")[()]
            idmap.start = numpy.empty(stored_start.shape, '|O')
            idmap.stop = numpy.empty(stored_stop.shape, '|O')

            # -1 marks a missing bound: only the other values are copied
            has_start = stored_start != -1
            has_stop = stored_stop != -1
            idmap.start[has_start] = stored_start[has_start]
            idmap.stop[has_stop] = stored_stop[has_stop]

            assert idmap.validate(), "Error: wrong IdMap format"
            return idmap

    @classmethod
    @check_path_existance
    def read_txt(cls, input_file_name):
        """Read IdMap in text format.

        The file holds one entry per line, with whitespace separated columns:
        either "left right" or "left right start stop". The layout is detected
        from the first non-blank line. An empty file gives an empty IdMap.

        :param input_file_name: name of the file to read from

        :return: the IdMap object read from the file.

        :raise Exception: if the file has neither 2 nor 4 columns or if the
            loaded IdMap is not valid.
        """
        idmap = IdMap()

        # Count the columns on the first non-blank line. split() without
        # argument copes with tabs, repeated spaces and the trailing newline.
        columns = 0
        with open(input_file_name, "r") as f:
            for line in f:
                fields = line.split()
                if fields:
                    columns = len(fields)
                    break

        if columns == 0:
            # Nothing to load
            return idmap

        # ndmin=1 keeps the arrays one-dimensional when the file has one row
        if columns == 2:
            idmap.leftids, idmap.rightids = numpy.loadtxt(input_file_name,
                                                          dtype={'names': ('left', 'right'), 'formats': ('|O', '|O')},
                                                          usecols=(0, 1), unpack=True, ndmin=1)
            idmap.start = numpy.empty(idmap.rightids.shape, '|O')
            idmap.stop = numpy.empty(idmap.rightids.shape, '|O')

        # If four columns
        elif columns == 4:
            idmap.leftids, idmap.rightids, idmap.start, idmap.stop = numpy.loadtxt(
                input_file_name,
                dtype={'names': ('left', 'right', 'start', 'stop'),
                       'formats': ('|O', '|O', 'int', 'int')}, unpack=True, ndmin=1)

        else:
            raise Exception('Wrong format of IdMap')

        if not idmap.validate():
            raise Exception('Wrong format of IdMap')
        return idmap

    def merge(self, idmap2):
        """ Merges the current IdMap with another IdMap.

        The result holds all the entries of the current IdMap followed by the
        entries of idmap2 whose (leftid, rightid) pair is not already in the
        current IdMap, in their original order.

        :param idmap2: Another Id_Map object.

        :return: an Id_Map object that contains the information from the two
            input Id_Maps.

        :raise Exception: if one of the two IdMaps, or the merged one, is
            not valid.
        """
        if not (self.validate() and idmap2.validate()):
            raise Exception('Cannot merge IdMaps, wrong type')

        # Set of (model, seg) pairs of the current IdMap for quick comparison
        known_sessions = set(zip(self.leftids, self.rightids))

        # Boolean mask of the sessions of idmap2 that are not in the current one
        is_new = numpy.array([session not in known_sessions
                              for session in zip(idmap2.leftids, idmap2.rightids)],
                             dtype='bool')

        idmap = IdMap()
        idmap.leftids = numpy.concatenate((self.leftids, idmap2.leftids[is_new]), axis=0)
        idmap.rightids = numpy.concatenate((self.rightids, idmap2.rightids[is_new]), axis=0)
        idmap.start = numpy.concatenate((self.start, idmap2.start[is_new]), axis=0)
        idmap.stop = numpy.concatenate((self.stop, idmap2.stop[is_new]), axis=0)

        if not idmap.validate():
            raise Exception('Wrong format of IdMap')

        return idmap

    def split(self, N):
        """
        Split an IdMap object into N IdMap of same size (if possible)

        The entries are distributed in order, as done by numpy.array_split.

        :param N: the number of IdMap to generate
        :return: a list of IdMap
        """
        session_nb = self.leftids.shape[0]

        im_list = []
        for indices in numpy.array_split(numpy.arange(session_nb), N):
            part = self._subset(indices)
            assert part.validate(), "Error: wrong IdMap format"
            im_list.append(part)

        return im_list