# Source code for sidekit.bosaris.ndx

# -*- coding: utf-8 -*-

# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from `the website
# <https://sites.google.com/site/bosaristoolkit/>`_.

"""
This is the 'ndx' module
"""
import h5py
import logging
import numpy
import sys
from ..sidekit_wrappers import check_path_existance, deprecated

__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


def diff(list1, list2):
    """Return, in sorted order, the items of ``list1`` absent from ``list2``.

    Duplicates present in ``list1`` are preserved; membership is tested
    with the ``in`` operator against ``list2``.

    :param list1: iterable of items to filter
    :param list2: container of items to exclude

    :return: a new sorted list of the remaining items
    """
    return sorted(item for item in list1 if item not in list2)


def ismember(list1, list2):
    """Tell, for each item of ``list1``, whether it belongs to ``list2``.

    Membership is evaluated with the ``in`` operator, so when ``list2`` is
    a string the test is a substring test rather than an equality test.

    :param list1: iterable of items to look up
    :param list2: container searched for each item

    :return: a list of booleans, one per item of ``list1``, in order
    """
    flags = []
    for item in list1:
        flags.append(item in list2)
    return flags


class Ndx:
    """A class that encodes trial index information.  It has a list of
    model names and a list of test segment names and a matrix
    indicating which combinations of model and test segment are
    trials of interest.

    :attr modelset: list of unique models in a ndarray
    :attr segset:  list of unique test segments in a ndarray
    :attr trialmask: 2D ndarray of boolean. Rows correspond to the models
            and columns to the test segments. True if the trial is of interest.
    """

    def __init__(self, ndx_file_name='',
                 models=numpy.array([]),
                 testsegs=numpy.array([])):
        """Initialize a Ndx object, either by loading it from a HDF5 file
        or by building it from two parallel arrays describing the trials.

        The default array arguments are only read, never modified, so
        sharing them between calls is harmless.

        :param ndx_file_name: name of the HDF5 file to load; when empty, the
                object is built from ``models`` and ``testsegs`` instead
        :param models: 1D array of model names, one entry per trial
        :param testsegs: 1D array of test segment names, one entry per trial
                (``testsegs[i]`` is tested against ``models[i]``)
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.trialmask = numpy.array([], dtype="bool")

        if ndx_file_name == '':
            self.modelset, self.segset, self.trialmask = Ndx._build_index(models, testsegs)
            assert self.validate(), "Wrong Ndx format"

        else:
            ndx = Ndx.read(ndx_file_name)
            self.modelset = ndx.modelset
            self.segset = ndx.segset
            self.trialmask = ndx.trialmask

    @staticmethod
    def _build_index(models, testsegs):
        """Build the unique name sets and the trial mask from a trial list.

        :param models: 1D array-like of model names, one entry per trial
        :param testsegs: 1D array-like of segment names, one entry per trial

        :return: a tuple (modelset, segset, trialmask)
        """
        # The inverse indices give, for every trial, the row/column of its
        # model/segment in the sorted unique sets.  Names are thus matched
        # by equality (a model named "spk1" never matches "spk10").
        modelset, model_idx = numpy.unique(models, return_inverse=True)
        segset, seg_idx = numpy.unique(testsegs, return_inverse=True)

        trialmask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        trialmask[model_idx.ravel(), seg_idx.ravel()] = True
        return modelset, segset, trialmask

    @staticmethod
    def _is_member(items, allowed):
        """Return a boolean ndarray telling which ``items`` are in ``allowed``.

        :param items: iterable of hashable names to look up
        :param allowed: iterable of hashable names to look for

        :return: 1D boolean ndarray with one entry per item
        """
        allowed = set(allowed)
        # Explicit dtype so that an empty input still yields a boolean mask
        # that can be used for indexing.
        return numpy.array([item in allowed for item in items], dtype="bool")

    @staticmethod
    def _expand_mask(ndx, modelset, segset):
        """Project the trial mask of ``ndx`` onto larger sorted name sets.

        :param ndx: the Ndx object whose mask is expanded
        :param modelset: sorted 1D array containing every model of ``ndx``
        :param segset: sorted 1D array containing every segment of ``ndx``

        :return: boolean ndarray of shape (len(modelset), len(segset))
        """
        expanded = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        if ndx.trialmask.size:
            # Each name is located in the sorted union, so the mapping is
            # correct whatever the order of the names inside ``ndx``.
            rows = numpy.searchsorted(modelset, ndx.modelset)
            cols = numpy.searchsorted(segset, ndx.segset)
            expanded[numpy.ix_(rows, cols)] = ndx.trialmask
        return expanded

    @check_path_existance
    def write(self, output_file_name):
        """ Save Ndx object in HDF5 format

        The name sets are stored as byte strings and the trial mask as int8.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong Ndx format"

        with h5py.File(output_file_name, "w") as f:
            f.create_dataset("modelset", data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("segset", data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("trial_mask", data=self.trialmask.astype('int8'),
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def save_txt(self, output_file_name):
        """Save a Ndx object in a text file, one "model segment" pair
        per line for every trial of interest.

        :param output_file_name: name of the file to write to
        """
        # The context manager closes the file even if a write fails.
        with open(output_file_name, 'w') as fid:
            for m in range(self.modelset.shape[0]):
                for seg in self.segset[self.trialmask[m, ]]:
                    fid.write('{} {}\n'.format(self.modelset[m], seg))

    def filter(self, modlist, seglist, keep):
        """Removes some of the information in an Ndx. Useful for creating a
        gender specific Ndx from a pooled gender Ndx.  Depending on the
        value of 'keep', the two input lists indicate the strings to
        retain or the strings to discard.

        :param modlist: a list of strings which will be compared with
                the modelset of the current object.
        :param seglist: a list of strings which will be compared with
                the segset of the current object.
        :param keep: a boolean indicating whether modlist and seglist are the
                models to keep or discard.

        :return: a filtered version of the current Ndx object.
        """
        keepmodidx = Ndx._is_member(self.modelset, modlist)
        keepsegidx = Ndx._is_member(self.segset, seglist)
        if not keep:
            # The lists name what must be discarded: keep everything else.
            keepmodidx = ~keepmodidx
            keepsegidx = ~keepsegidx

        outndx = Ndx()
        outndx.modelset = self.modelset[keepmodidx]
        outndx.segset = self.segset[keepsegidx]
        outndx.trialmask = self.trialmask[keepmodidx, :][:, keepsegidx]

        assert outndx.validate(), "Wrong Ndx format"

        if self.modelset.shape[0] > outndx.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outndx.modelset.shape[0])
        if self.segset.shape[0] > outndx.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outndx.segset.shape[0])
        return outndx

    def validate(self):
        """Checks that an object of type Ndx obeys certain rules that
        must always be true: the three attributes are ndarrays, the name
        sets are 1D, the mask is 2D and its shape matches the name sets.

        :return: a boolean value indicating whether the object is valid
        """
        attributes = (self.modelset, self.segset, self.trialmask)
        # Checked first so that a wrong type yields False instead of an
        # AttributeError on the ndim/shape accesses below.
        if not all(isinstance(attr, numpy.ndarray) for attr in attributes):
            return False

        if self.modelset.ndim != 1 or self.segset.ndim != 1 or self.trialmask.ndim != 2:
            return False

        return self.trialmask.shape == (self.modelset.shape[0], self.segset.shape[0])

    @staticmethod
    def read(input_file_name):
        """Creates an Ndx object from the information in an hdf5 file.

        :param input_file_name: name of the file to read from

        :return: the Ndx object read from the file
        """
        with h5py.File(input_file_name, "r") as f:
            ndx = Ndx()
            ndx.modelset = f.get("modelset")[()]
            ndx.segset = f.get("segset")[()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                # 'U' without a length lets numpy size the strings itself,
                # so long names are not truncated.
                ndx.modelset = ndx.modelset.astype('U', copy=False)
                ndx.segset = ndx.segset.astype('U', copy=False)

            ndx.trialmask = f.get("trial_mask")[()].astype('bool')

            assert ndx.validate(), "Error: wrong Ndx format"
            return ndx

    @classmethod
    @check_path_existance
    def read_txt(cls, input_filename):
        """Creates an Ndx object from information stored in a text file.

        Each non-blank line holds a model name followed by a test segment
        name, separated by whitespace.

        :param input_filename: name of the file to read from

        :return: the Ndx object described by the file
        """
        ndx = Ndx()

        with open(input_filename, 'r') as fid:
            fields = [line.split() for line in fid]
        # Blank lines (e.g. a trailing newline) carry no trial.
        fields = [entry for entry in fields if entry]

        models = numpy.empty(len(fields), '|O')
        testsegs = numpy.empty(len(fields), '|O')
        for ii, entry in enumerate(fields):
            models[ii] = entry[0]
            testsegs[ii] = entry[1]

        ndx.modelset, ndx.segset, ndx.trialmask = Ndx._build_index(models, testsegs)

        assert ndx.validate(), "Wrong Ndx format"
        return ndx

    def merge(self, ndx_list):
        """Merges a list of Ndx objects into the current one.
        The resulting ndx must have all models and segment in the input
        ndxs (only once).  A trial in any ndx becomes a trial in the
        output ndx

        The current object is modified in place.

        :param ndx_list: list of Ndx objects to merge
        """
        assert isinstance(ndx_list, list), "Input is not a list"
        for ndx in ndx_list:
            assert isinstance(ndx, Ndx), \
                '{} {} {}'.format("Element ", ndx, " is not an Ndx")

        assert self.validate(), "Wrong Ndx format"
        for ndx2 in ndx_list:
            # union1d returns sorted unique names, as _expand_mask requires
            modelset = numpy.union1d(self.modelset, ndx2.modelset)
            segset = numpy.union1d(self.segset, ndx2.segset)

            # a trial of interest in either ndx is one in the merged ndx
            trials = Ndx._expand_mask(self, modelset, segset) | Ndx._expand_mask(ndx2, modelset, segset)

            self.modelset = modelset
            self.segset = segset
            self.trialmask = trials