# Source code for sidekit.bosaris.key

# -*- coding: utf-8 -*-

# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from `the website
# <https://sites.google.com/site/bosaristoolkit/>`_.

"""
This is the 'key' module
"""
import h5py
import logging
import numpy
import sys
from .ndx import Ndx
from ..sidekit_wrappers import check_path_existance

__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


def diff(list1, list2):
    """Return, sorted, the items of list1 that do not appear in list2.

    Duplicates present in list1 are kept.

    :param list1: iterable of items to filter
    :param list2: collection of items to exclude

    :return: a new sorted list of the items of list1 absent from list2
    """
    items = list(list1)
    c = None
    # A string's ``in`` operator is a substring test, which a set of its
    # characters would not reproduce, so strings keep the direct test.
    if not isinstance(list2, (str, bytes)):
        try:
            # Hash-based lookup avoids rescanning list2 for every item.
            excluded = set(list2)
            c = [item for item in items if item not in excluded]
        except TypeError:
            # Unhashable items: fall back to the direct test below.
            c = None
    if c is None:
        c = [item for item in items if item not in list2]
    c.sort()
    return c


def ismember(list1, list2):
    """Tell, for each item of list1, whether it appears in list2.

    :param list1: iterable of items to look up
    :param list2: collection in which the items are searched

    :return: a list of booleans, one per item of list1, in the same order
    """
    items = list(list1)
    # A string's ``in`` operator is a substring test, which a set of its
    # characters would not reproduce, so strings keep the direct test.
    if not isinstance(list2, (str, bytes)):
        try:
            # Hash-based lookup avoids rescanning list2 for every item.
            lookup = set(list2)
            return [item in lookup for item in items]
        except TypeError:
            # Unhashable items: fall back to the direct test below.
            pass
    return [item in list2 for item in items]


class Key:
    """A class for representing a Key i.e. it classifies trials as
    target or non-target trials.

    :attr modelset: list of the models into a ndarray of strings
    :attr segset: list of the test segments into a ndarray of strings
    :attr tar: 2D ndarray of booleans which rows correspond to the models
            and columns to the test segments. True if target trial.
    :attr non: 2D ndarray of booleans which rows correspond to the models
            and columns to the test segments. True if non-target trial.

    A (model, segment) cell that is False in both ``tar`` and ``non`` is
    not a trial.
    """

    def __init__(self, key_file_name='',
                 models=numpy.array([]),
                 testsegs=numpy.array([]),
                 trials=numpy.array([])):
        """Initialize a Key object.

        :param key_file_name: name of the HDF5 file to load. Default is ''.
        :param models: a list of models, one entry per trial
        :param testsegs: a list of test segments, one entry per trial
        :param trials: a list of labels, one entry per trial; 'target' marks
            a target trial and 'nontarget' a non-target trial

        In case the key_file_name is empty, the Key is built from models,
        testsegs and trials (an empty Key if they are left to their
        defaults). Otherwise the Key is loaded from key_file_name and the
        three lists are ignored.
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.tar = numpy.array([], dtype="bool")
        self.non = numpy.array([], dtype="bool")

        if key_file_name == '':
            (self.modelset, self.segset,
             self.tar, self.non) = Key._build_masks(models, testsegs, trials)
            assert self.validate(), "Wrong Key format"
        else:
            loaded = self.read(key_file_name)
            self.modelset = loaded.modelset
            self.segset = loaded.segset
            self.tar = loaded.tar
            self.non = loaded.non

    @staticmethod
    def _build_masks(models, testsegs, trials):
        """Turn three parallel per-trial sequences into the Key arrays.

        :param models: model identifier of each trial
        :param testsegs: test segment identifier of each trial
        :param trials: label of each trial ('target' or 'nontarget')

        :return: a tuple (modelset, segset, tar, non)
        :raise ValueError: if the three sequences differ in length
        """
        # Accept plain lists and the 0-d arrays produced by single-line files.
        models = numpy.atleast_1d(numpy.asarray(models))
        testsegs = numpy.atleast_1d(numpy.asarray(testsegs))
        trials = numpy.atleast_1d(numpy.asarray(trials))
        if not (models.shape[0] == testsegs.shape[0] == trials.shape[0]):
            raise ValueError(
                "models, testsegs and trials must have the same length "
                "({}, {}, {})".format(models.shape[0],
                                      testsegs.shape[0],
                                      trials.shape[0]))

        # The inverse indices give, for each trial, its row and column.
        modelset, rows = numpy.unique(models, return_inverse=True)
        segset, cols = numpy.unique(testsegs, return_inverse=True)

        mask_shape = (modelset.shape[0], segset.shape[0])
        tar = numpy.zeros(mask_shape, dtype="bool")
        non = numpy.zeros(mask_shape, dtype="bool")

        # Explicit in-order loop: when a (model, segment) pair is listed
        # several times, the last occurrence decides both masks.
        for row, col, label in zip(rows.ravel().tolist(),
                                   cols.ravel().tolist(),
                                   trials.tolist()):
            tar[row, col] = (label == 'target')
            non[row, col] = (label == 'nontarget')

        return modelset, segset, tar, non

    @staticmethod
    def _expand_masks(key, modelset, segset):
        """Project the masks of a Key onto larger model and segment sets.

        :param key: the Key whose masks are expanded
        :param modelset: sorted ndarray of unique models containing every
            model of key
        :param segset: sorted ndarray of unique segments containing every
            segment of key

        :return: a tuple (tar, non) of boolean arrays of shape
            (len(modelset), len(segset))
        """
        tar = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        non = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        if key.modelset.shape[0] and key.segset.shape[0]:
            # Locate each identifier of key by value, so the mapping is
            # right whatever the order of key.modelset and key.segset.
            rows = numpy.searchsorted(modelset, key.modelset)
            cols = numpy.searchsorted(segset, key.segset)
            cells = numpy.ix_(rows, cols)
            tar[cells] = key.tar
            non[cells] = key.non
        return tar, non

    @check_path_existance
    def write(self, output_file_name):
        """ Save Key in HDF5 format

        The file holds three datasets: "modelset" and "segset" as byte
        strings, and "trial_mask" as an int8 matrix in which 1 is a target
        trial, -1 a non-target trial and 0 no trial.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong Key format"

        with h5py.File(output_file_name, "w") as f:
            f.create_dataset("modelset", data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("segset", data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            trialmask = numpy.array(self.tar, dtype='int8') - numpy.array(self.non, dtype='int8')
            f.create_dataset("trial_mask", data=trialmask,
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def write_txt(self, output_file_name):
        """Save a Key object to a text file.

        One line is written per trial, in the form
        ``<model> <segment> <target|nontarget>``; for each model the target
        trials come before the non-target ones.

        :param output_file_name: name of the output text file
        """
        assert self.validate(), "Error: wrong Key format"

        # The context manager closes the file even if a write fails.
        with open(output_file_name, 'w') as fid:
            for model, is_tar, is_non in zip(self.modelset, self.tar, self.non):
                for seg in self.segset[is_tar]:
                    fid.write('{} {} {}\n'.format(model, seg, 'target'))
                for seg in self.segset[is_non]:
                    fid.write('{} {} {}\n'.format(model, seg, 'nontarget'))

    def filter(self, modlist, seglist, keep):
        """Removes some of the information in a key.  Useful for creating a
        gender specific key from a pooled gender key.  Depending on the
        value of 'keep', the two input lists indicate the strings to
        retain or the strings to discard.

        :param modlist: a list of strings which will be compared with
            the modelset of this key.
        :param seglist: a list of strings which will be compared with
            the segset of this key.
        :param keep: a boolean indicating whether modlist and seglist are the
            models to keep or discard.

        :return: a filtered version of this key, as a new Key object.
        """
        if keep:
            keepmods = modlist
            keepsegs = seglist
        else:
            keepmods = diff(self.modelset, modlist)
            keepsegs = diff(self.segset, seglist)

        # Force a boolean dtype: an empty list would otherwise become a
        # float array, which cannot be used as an index.
        keepmodidx = numpy.array(ismember(self.modelset, keepmods), dtype="bool")
        keepsegidx = numpy.array(ismember(self.segset, keepsegs), dtype="bool")

        outkey = Key()
        outkey.modelset = self.modelset[keepmodidx]
        outkey.segset = self.segset[keepsegidx]
        outkey.tar = self.tar[keepmodidx, :][:, keepsegidx]
        outkey.non = self.non[keepmodidx, :][:, keepsegidx]

        assert(outkey.validate())

        if self.modelset.shape[0] > outkey.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outkey.modelset.shape[0])
        if self.segset.shape[0] > outkey.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outkey.segset.shape[0])
        return outkey

    def to_ndx(self):
        """Create a Ndx object based on the Key object

        The trial mask of the Ndx is True wherever the Key holds a target
        or a non-target trial.

        :return: a Ndx object based on the Key
        """
        ndx = Ndx()
        ndx.modelset = self.modelset
        ndx.segset = self.segset
        ndx.trialmask = self.tar | self.non
        return ndx

    def validate(self):
        """Checks that an object of type Key obeys certain rules that
        must always be true.

        The four attributes must be ndarrays, modelset and segset must be
        1D, tar and non must be 2D with identical shapes of
        (number of models, number of segments).

        :return: a boolean value indicating whether the object is valid.
        """
        # Each test guards the next one, so a wrong type or a wrong rank
        # yields False instead of an AttributeError or an IndexError.
        attributes = (self.modelset, self.segset, self.tar, self.non)
        if not all(isinstance(attr, numpy.ndarray) for attr in attributes):
            return False
        if self.modelset.ndim != 1 or self.segset.ndim != 1:
            return False
        if self.tar.ndim != 2 or self.non.ndim != 2:
            return False
        return (self.tar.shape == self.non.shape
                and self.tar.shape[0] == self.modelset.shape[0]
                and self.tar.shape[1] == self.segset.shape[0])

    @staticmethod
    def read(input_file_fame):
        """Reads a Key object from an hdf5 file.

        :param input_file_fame: name of the file to read from

        :return: the Key object stored in the file
        """
        with h5py.File(input_file_fame, "r") as f:

            key = Key()
            key.modelset = f.get("modelset")[()]
            key.segset = f.get("segset")[()]

            # if running python 3, need a conversion to unicode; the width
            # is left to numpy so that long identifiers are not truncated
            if sys.version_info[0] >= 3:
                key.modelset = key.modelset.astype('U', copy=False)
                key.segset = key.segset.astype('U', copy=False)

            # 1 encodes a target trial, -1 a non-target trial
            trialmask = f.get("trial_mask")[()]
            key.tar = (trialmask == 1)
            key.non = (trialmask == -1)

            assert key.validate(), "Error: wrong Key format"
            return key

    @staticmethod
    def read_txt(input_file_name):
        """Creates a Key object from information stored in a text file.

        Each line of the file holds three space-separated fields: the
        model, the test segment and the label ('target' or 'nontarget').

        :param input_file_name: name of the file to read from

        :return: the Key object described by the file
        """
        columns = numpy.loadtxt(input_file_name,
                                delimiter=' ',
                                dtype={'names': ('mod', 'seg', 'key'),
                                       'formats': ('S1000', 'S1000', 'S10')},
                                unpack=True)

        converted = []
        for column in columns:
            # A file with a single line yields 0-d arrays.
            column = numpy.atleast_1d(column)
            # Going through object dtype shrinks the fixed-width byte
            # strings to the length of the longest entry.
            column = column.astype('|O', copy=False).astype('S', copy=False)
            if sys.version_info[0] >= 3:
                column = column.astype('U', copy=False)
            converted.append(column)
        models, testsegs, trial = converted

        key = Key()
        (key.modelset, key.segset,
         key.tar, key.non) = Key._build_masks(models, testsegs, trial)
        assert key.validate(), "Wrong Key format"
        return key

    def merge(self, key_list):
        """Merges Key objects. This function takes as input a list of
        Key objects to merge in the current one.

        The current Key is modified in place; nothing is returned.

        :param key_list: the list of Keys to merge
        """
        # the output key must have all models and segment in the input
        # keys (only once) and the same target and non-target trials.
        # It is an error if a trial is a target in one key and a
        # non-target in another, but a target or non-target marker will
        # override a 'non-trial' marker.
        assert isinstance(key_list, list), "Input is not a list"
        for key in key_list:
            assert isinstance(key, Key), \
                '{} {} {}'.format("Element ", key, " is not a Key")

        for other in key_list:
            # union1d returns sorted, duplicate-free sets
            modelset = numpy.union1d(self.modelset, other.modelset)
            segset = numpy.union1d(self.segset, other.segset)

            # expand both pairs of masks onto the merged sets
            tar_1, non_1 = Key._expand_masks(self, modelset, segset)
            tar_2, non_2 = Key._expand_masks(other, modelset, segset)

            # merge masks
            tar = tar_1 | tar_2
            non = non_1 | non_2

            # check for clashes before touching the current key
            assert not numpy.any(tar & non), "Conflict in the new Key"

            self.modelset = modelset
            self.segset = segset
            self.tar = tar
            self.non = non
            assert self.validate(), "Error: wrong Key format"