Source code for sidekit.bosaris.key

# -*- coding: utf-8 -*-

# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from `the website
# <https://sites.google.com/site/bosaristoolkit/>`_.

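"""
This is the 'key' module.

It defines the Key class, which labels each (model, test segment) trial as
target or non-target, together with two small list helpers used by it.
"""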
import h5py
import logging
import numpy
import sys
from .ndx import Ndx
from ..sidekit_wrappers import check_path_existance

# Module-level metadata: authorship, maintenance contact, release status,
# docstring markup format and credits for the original BOSARIS authors.
__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]


def diff(list1, list2):
    """Return, in sorted order, the items of list1 that are not in list2.

    :param list1: iterable of items to filter
    :param list2: container tested with the ``in`` operator

    :return: a new sorted list; duplicates present in list1 are kept
    """
    remaining = []
    for element in list1:
        if element in list2:
            continue
        remaining.append(element)
    return sorted(remaining)


def ismember(list1, list2):
    """Tell, for each item of list1, whether it belongs to list2.

    :param list1: iterable of items to look up
    :param list2: container tested with the ``in`` operator

    :return: a list of booleans, one per item of list1, in the same order
    """
    flags = []
    for element in list1:
        flags.append(element in list2)
    return flags


class Key:
    """A class for representing a Key, i.e. it classifies trials as
    target or non-target trials.

    :attr modelset: list of the models into a ndarray of strings
    :attr segset: list of the test segments into a ndarray of strings
    :attr tar: 2D ndarray of booleans which rows correspond to the models
            and columns to the test segments. True if target trial.
    :attr non: 2D ndarray of booleans which rows correspond to the models
            and columns to the test segments. True if non-target trial.
    """

    def __init__(self, key_file_name='',
                 models=numpy.array([]),
                 testsegs=numpy.array([]),
                 trials=numpy.array([])):
        """Initialize a Key object.

        :param key_file_name: name of the HDF5 file to load. Default is ''.
        :param models: model identifier of each trial
        :param testsegs: test segment identifier of each trial
        :param trials: label of each trial, 'target' or 'nontarget'; any
            other label leaves the trial unset in both masks

        In case the key_file_name is empty, the Key is built from models,
        testsegs and trials (an empty Key when they are left to their
        defaults). Otherwise those three arguments are ignored and the Key
        is loaded from the file.
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.tar = numpy.array([], dtype="bool")
        self.non = numpy.array([], dtype="bool")

        if key_file_name == '':
            (self.modelset, self.segset,
             self.tar, self.non) = self._build_masks(models, testsegs, trials)
            assert self.validate(), "Wrong Key format"
        else:
            tmp = self.read(key_file_name)
            self.modelset = tmp.modelset
            self.segset = tmp.segset
            self.tar = tmp.tar
            self.non = tmp.non

    @staticmethod
    def _build_masks(models, testsegs, trials):
        """Build the model set, segment set and both trial masks.

        :param models: model identifier of each trial
        :param testsegs: test segment identifier of each trial
        :param trials: label of each trial ('target' or 'nontarget')

        :return: a tuple (modelset, segset, tar, non)
        """
        modelset = numpy.unique(models)
        segset = numpy.unique(testsegs)
        shape = (modelset.shape[0], segset.shape[0])
        tar = numpy.zeros(shape, dtype="bool")
        non = numpy.zeros(shape, dtype="bool")

        # Map every identifier to its row / column once so that the trial
        # list is walked a single time.
        model_row = {model: row for row, model in enumerate(modelset)}
        seg_col = {seg: col for col, seg in enumerate(segset)}

        # When a (model, segment) pair is listed several times, the last
        # occurrence decides its label.
        for model, seg, label in zip(models, testsegs, trials):
            row = model_row[model]
            col = seg_col[seg]
            tar[row, col] = (label == 'target')
            non[row, col] = (label == 'nontarget')
        return modelset, segset, tar, non

    @staticmethod
    def _expand_masks(key, modelset, segset):
        """Re-index the masks of a key onto larger model and segment sets.

        :param key: the Key whose masks are expanded
        :param modelset: sorted ndarray of unique models containing every
            model of key
        :param segset: sorted ndarray of unique segments containing every
            segment of key

        :return: a tuple (tar, non) of boolean ndarrays of shape
            (number of models, number of segments) of the larger sets
        """
        # Because the target sets are sorted and contain every entry of the
        # key, searchsorted gives the exact position of each entry, whatever
        # the order of the entries inside the key.
        own_models = numpy.asarray(key.modelset).astype(modelset.dtype,
                                                        copy=False)
        own_segs = numpy.asarray(key.segset).astype(segset.dtype, copy=False)
        rows = numpy.searchsorted(modelset, own_models)
        cols = numpy.searchsorted(segset, own_segs)

        shape = (modelset.shape[0], segset.shape[0])
        tar = numpy.zeros(shape, dtype="bool")
        non = numpy.zeros(shape, dtype="bool")
        grid = numpy.ix_(rows, cols)
        tar[grid] = key.tar
        non[grid] = key.non
        return tar, non

    # NOTE(review): check_path_existance is defined in sidekit_wrappers and
    # is not visible here; presumably it prepares the output directory.
    @check_path_existance
    def write(self, output_file_name):
        """Save Key in HDF5 format.

        The file holds three datasets: "modelset" and "segset" stored as
        byte strings, and "trial_mask" which is 1 for a target trial, -1 for
        a non-target trial and 0 otherwise.

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong Key format"

        with h5py.File(output_file_name, "w") as f:
            f.create_dataset("modelset", data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("segset", data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            trialmask = numpy.array(self.tar, dtype='int8') \
                - numpy.array(self.non, dtype='int8')
            f.create_dataset("trial_mask", data=trialmask,
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def write_txt(self, output_file_name):
        """Save a Key object to a text file.

        One line "model segment label" is written per trial; for each model
        the target trials come first, then the non-target trials.

        :param output_file_name: name of the output text file
        """
        # The context manager closes the file even if a write fails.
        with open(output_file_name, 'w') as fid:
            for m, model in enumerate(self.modelset):
                for seg in self.segset[self.tar[m, ]]:
                    fid.write('{} {} {}\n'.format(model, seg, 'target'))
                for seg in self.segset[self.non[m, ]]:
                    fid.write('{} {} {}\n'.format(model, seg, 'nontarget'))

    def filter(self, modlist, seglist, keep):
        """Removes some of the information in a key. Useful for creating a
        gender specific key from a pooled gender key. Depending on the
        value of 'keep', the two input lists indicate the strings to
        retain or the strings to discard.

        :param modlist: a list of strings which will be compared with
            the modelset of this Key.
        :param seglist: a list of strings which will be compared with
            the segset of this Key.
        :param keep: a boolean indicating whether modlist and seglist are
            the models to keep or discard.

        :return: a filtered version of this Key; the current object is
            left unchanged.
        """
        if keep:
            keepmods = modlist
            keepsegs = seglist
        else:
            keepmods = diff(self.modelset, modlist)
            keepsegs = diff(self.segset, seglist)

        # dtype=bool keeps the masks usable as indices even when they are
        # empty (an empty list would otherwise become a float array).
        keepmodidx = numpy.array(ismember(self.modelset, keepmods),
                                 dtype=bool)
        keepsegidx = numpy.array(ismember(self.segset, keepsegs),
                                 dtype=bool)

        outkey = Key()
        outkey.modelset = self.modelset[keepmodidx]
        outkey.segset = self.segset[keepsegidx]
        outkey.tar = self.tar[keepmodidx, :][:, keepsegidx]
        outkey.non = self.non[keepmodidx, :][:, keepsegidx]

        assert outkey.validate()

        if self.modelset.shape[0] > outkey.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d',
                         self.modelset.shape[0],
                         outkey.modelset.shape[0])
        if self.segset.shape[0] > outkey.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d',
                         self.segset.shape[0],
                         outkey.segset.shape[0])
        return outkey

    def to_ndx(self):
        """Create a Ndx object based on the Key object.

        The trial mask of the Ndx is True wherever the Key holds a target
        or a non-target trial.

        :return: a Ndx object based on the Key
        """
        ndx = Ndx()
        ndx.modelset = self.modelset
        ndx.segset = self.segset
        ndx.trialmask = self.tar | self.non
        return ndx

    def validate(self):
        """Checks that an object of type Key obeys certain rules that
        must always be true.

        The four attributes must be ndarrays, modelset and segset must be
        1D, tar and non must be 2D with identical shapes of
        (number of models, number of segments).

        :return: a boolean value indicating whether the object is valid.
        """
        arrays = (self.modelset, self.segset, self.tar, self.non)
        # Stop at the first failure: the later checks read .ndim and
        # .shape, which would raise on non-arrays or on 1D masks.
        if not all(isinstance(item, numpy.ndarray) for item in arrays):
            return False
        if self.modelset.ndim != 1 or self.segset.ndim != 1:
            return False
        if self.tar.ndim != 2 or self.non.ndim != 2:
            return False
        if self.tar.shape != self.non.shape:
            return False
        expected = (self.modelset.shape[0], self.segset.shape[0])
        return self.tar.shape == expected

    @staticmethod
    def read(input_file_fame):
        """Reads a Key object from an hdf5 file.

        :param input_file_fame: name of the file to read from (the
            spelling of the parameter is kept for backward compatibility)

        :return: the Key stored in the file
        """
        with h5py.File(input_file_fame, "r") as f:

            key = Key()
            key.modelset = f.get("modelset")[()]
            key.segset = f.get("segset")[()]

            # if running python 3, need a conversion to unicode; 'U' keeps
            # the stored length so that long identifiers are not truncated
            if sys.version_info[0] == 3:
                key.modelset = key.modelset.astype('U', copy=False)
                key.segset = key.segset.astype('U', copy=False)

            trialmask = f.get("trial_mask")[()]
            key.tar = (trialmask == 1)
            key.non = (trialmask == -1)

            assert key.validate(), "Error: wrong Key format"
            return key

    @staticmethod
    def read_txt(input_file_name):
        """Creates a Key object from information stored in a text file.

        Each line of the file holds three space separated fields: the
        model, the test segment and the label ('target' or 'nontarget').

        :param input_file_name: name of the file to read from

        :return: the Key described by the file
        """
        key = Key()

        models, testsegs, trial = numpy.loadtxt(
            input_file_name,
            delimiter=' ',
            dtype={'names': ('mod', 'seg', 'key'),
                   'formats': ('S1000', 'S1000', 'S10')},
            unpack=True)

        # atleast_1d keeps the fields iterable when the file holds a single
        # trial; the round trip through object dtype shrinks the fixed
        # width byte strings to the longest value actually present.
        models = numpy.atleast_1d(models)
        testsegs = numpy.atleast_1d(testsegs)
        trial = numpy.atleast_1d(trial)
        models = models.astype('|O', copy=False).astype('S', copy=False)
        testsegs = testsegs.astype('|O', copy=False).astype('S', copy=False)
        trial = trial.astype('|O', copy=False).astype('S', copy=False)

        if sys.version_info[0] == 3:
            models = models.astype('U', copy=False)
            testsegs = testsegs.astype('U', copy=False)
            trial = trial.astype('U', copy=False)

        (key.modelset, key.segset,
         key.tar, key.non) = Key._build_masks(models, testsegs, trial)
        assert key.validate(), "Wrong Key format"
        return key

    def merge(self, key_list):
        """Merges Key objects. This function takes as input a list of
        Key objects to merge in the current one.

        The current Key is modified in place and nothing is returned.

        :param key_list: the list of Keys to merge
        """
        # the output key must have all models and segment in the input
        # keys (only once) and the same target and non-target trials.
        # It is an error if a trial is a target in one key and a
        # non-target in another, but a target or non-target marker will
        # override a 'non-trial' marker.
        assert isinstance(key_list, list), "Input is not a list"
        for key in key_list:
            assert isinstance(key, Key), \
                '{} {} {}'.format("Element ", key, " is not a Key")

        for key2 in key_list:
            # union1d returns sorted, unique identifiers
            modelset = numpy.union1d(self.modelset, key2.modelset)
            segset = numpy.union1d(self.segset, key2.segset)

            # expand both keys onto the merged model and segment sets
            tar_1, non_1 = self._expand_masks(self, modelset, segset)
            tar_2, non_2 = self._expand_masks(key2, modelset, segset)

            # merge masks
            tar = tar_1 | tar_2
            non = non_1 | non_2

            # check for clashes
            assert numpy.sum(tar & non) == 0, "Conflict in the new Key"

            # the merged result becomes the current key, so the next key of
            # the list is merged into everything accumulated so far
            self.modelset = modelset
            self.segset = segset
            self.tar = tar
            self.non = non

        assert self.validate(), "Wrong Key format"