# -*- coding: utf-8 -*-
# This package is a translation of a part of the BOSARIS toolkit.
# The authors thank Niko Brummer and Agnitio for allowing them to
# translate this code and provide the community with efficient structures
# and tools.
#
# The BOSARIS Toolkit is a collection of functions and classes in Matlab
# that can be used to calibrate, fuse and plot scores from speaker recognition
# (or other fields in which scores are used to test the hypothesis that two
# samples are from the same source) trials involving a model and a test segment.
# The toolkit was written at the BOSARIS2010 workshop which took place at the
# University of Technology in Brno, Czech Republic from 5 July to 6 August 2010.
# See the User Guide (available on the toolkit website) for a discussion of the
# theory behind the toolkit and descriptions of some of the algorithms used.
#
# The BOSARIS toolkit in MATLAB can be downloaded from `the website
# <https://sites.google.com/site/bosaristoolkit/>`_.
"""
This is the 'ndx' module
"""
import h5py
import logging
import numpy
import sys
from ..sidekit_wrappers import check_path_existance, deprecated
# Module-level metadata: authorship, contact, release status, docstring
# markup format and credits.
__author__ = "Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'
__credits__ = ["Niko Brummer", "Edward de Villiers"]
def diff(list1, list2):
    """Return the sorted elements of ``list1`` that do not occur in ``list2``.

    Duplicates present in ``list1`` are preserved in the result.

    :param list1: iterable of elements to filter
    :param list2: container tested with the ``in`` operator

    :return: a new sorted list of the elements of ``list1`` absent from ``list2``
    """
    return sorted(item for item in list1 if item not in list2)
def ismember(list1, list2):
    """Tell, for each element of ``list1``, whether it belongs to ``list2``.

    :param list1: iterable of elements to look up
    :param list2: container of reference elements; a single ``str`` or
        ``bytes`` value is treated as a one-element collection

    :return: a list of booleans, one per element of ``list1``
    """
    # ``item in "some string"`` is a substring test, so a lone name such as
    # "m10" would wrongly report "m1" as a member. Wrap it so that the
    # comparison is done by equality.
    if isinstance(list2, (str, bytes)):
        list2 = [list2]
    return [item in list2 for item in list1]
class Ndx:
    """A class that encodes trial index information. It has a list of
    model names and a list of test segment names and a matrix
    indicating which combinations of model and test segment are
    trials of interest.

    :attr modelset: list of unique models in a ndarray
    :attr segset: list of unique test segments in a ndarray
    :attr trialmask: 2D ndarray of boolean. Rows correspond to the models
        and columns to the test segments. True if the trial is of interest.
    """

    # NOTE: the array defaults below are shared between calls; they are only
    # read, never modified, and are kept as-is to preserve the signature.
    def __init__(self, ndx_file_name='',
                 models=numpy.array([]),
                 testsegs=numpy.array([])):
        """Initialize a Ndx object, either by loading it from a HDF5 file
        or from two parallel sequences describing the trials.

        :param ndx_file_name: name of the HDF5 file to load; when it is the
            empty string the object is built from ``models`` and ``testsegs``
        :param models: sequence of model names, one per trial
        :param testsegs: sequence of test segment names, one per trial,
            aligned with ``models``
        """
        self.modelset = numpy.empty(0, dtype="|O")
        self.segset = numpy.empty(0, dtype="|O")
        self.trialmask = numpy.array([], dtype="bool")

        if ndx_file_name == '':
            self.modelset, self.segset, self.trialmask = Ndx._index_trials(models, testsegs)
            assert self.validate(), "Wrong Ndx format"
        else:
            ndx = Ndx.read(ndx_file_name)
            self.modelset = ndx.modelset
            self.segset = ndx.segset
            self.trialmask = ndx.trialmask

    @staticmethod
    def _index_trials(models, testsegs):
        """Build the unique model set, unique segment set and trial mask
        from two parallel sequences of (model, test segment) pairs.

        :param models: sequence of model names, one per trial
        :param testsegs: sequence of test segment names, one per trial

        :return: a tuple ``(modelset, segset, trialmask)``
        """
        models = numpy.asarray(models).ravel()
        testsegs = numpy.asarray(testsegs).ravel()

        # The inverse indices give, for every trial, the row/column of its
        # model/segment in the sorted unique sets. Names are matched by
        # equality, so "m1" is never confused with "m10".
        modelset, rows = numpy.unique(models, return_inverse=True)
        segset, cols = numpy.unique(testsegs, return_inverse=True)

        trialmask = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        if models.shape[0]:
            # Refuse misaligned inputs instead of letting numpy broadcast
            # a length-1 sequence against the other one.
            if models.shape[0] != testsegs.shape[0]:
                raise IndexError("models and testsegs must have the same length")
            trialmask[rows, cols] = True
        return modelset, segset, trialmask

    @staticmethod
    def _expand_mask(ndx, modelset, segset):
        """Re-express the trial mask of ``ndx`` on larger model/segment sets.

        :param ndx: the Ndx object whose trial mask is expanded
        :param modelset: sorted array of unique models containing every
            model of ``ndx``
        :param segset: sorted array of unique segments containing every
            segment of ``ndx``

        :return: a boolean ndarray of shape ``(len(modelset), len(segset))``
        """
        expanded = numpy.zeros((modelset.shape[0], segset.shape[0]), dtype="bool")
        if ndx.modelset.shape[0] and ndx.segset.shape[0]:
            # Look every name up in the sorted target set so the mapping
            # does not depend on the order of names inside ``ndx``.
            rows = numpy.searchsorted(modelset, ndx.modelset)
            cols = numpy.searchsorted(segset, ndx.segset)
            expanded[numpy.ix_(rows, cols)] = ndx.trialmask
        return expanded

    @check_path_existance
    def write(self, output_file_name):
        """Save Ndx object in HDF5 format

        :param output_file_name: name of the file to write to
        """
        assert self.validate(), "Error: wrong Ndx format"

        with h5py.File(output_file_name, "w") as f:
            # Names are stored as byte strings, the mask as int8.
            f.create_dataset("modelset", data=self.modelset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("segset", data=self.segset.astype('S'),
                             maxshape=(None,),
                             compression="gzip",
                             fletcher32=True)
            f.create_dataset("trial_mask", data=self.trialmask.astype('int8'),
                             maxshape=(None, None),
                             compression="gzip",
                             fletcher32=True)

    @check_path_existance
    def save_txt(self, output_file_name):
        """Save a Ndx object in a text file, one "model segment" pair
        per line for every trial of interest.

        :param output_file_name: name of the file to write to
        """
        # The context manager closes the file even if a write fails.
        with open(output_file_name, 'w') as fid:
            for m in range(self.modelset.shape[0]):
                segs = self.segset[self.trialmask[m, ]]
                for s in segs:
                    fid.write('{} {}\n'.format(self.modelset[m], s))

    def filter(self, modlist, seglist, keep):
        """Removes some of the information in an Ndx. Useful for creating a
        gender specific Ndx from a pooled gender Ndx. Depending on the
        value of 'keep', the two input lists indicate the strings to
        retain or the strings to discard.

        :param modlist: a list of strings which will be compared with
            the modelset of the current Ndx.
        :param seglist: a list of strings which will be compared with
            the segset of the current Ndx.
        :param keep: a boolean indicating whether modlist and seglist are the
            models to keep or discard.

        :return: a filtered version of the current Ndx object.
        """
        if keep:
            keepmods = modlist
            keepsegs = seglist
        else:
            keepmods = diff(self.modelset, modlist)
            keepsegs = diff(self.segset, seglist)

        # Force a boolean dtype: an empty membership list would otherwise
        # become a float array, which numpy rejects as an index.
        keepmodidx = numpy.array(ismember(self.modelset, keepmods), dtype="bool")
        keepsegidx = numpy.array(ismember(self.segset, keepsegs), dtype="bool")

        outndx = Ndx()
        outndx.modelset = self.modelset[keepmodidx]
        outndx.segset = self.segset[keepsegidx]
        tmp = self.trialmask[keepmodidx, :]
        outndx.trialmask = tmp[:, keepsegidx]

        assert outndx.validate(), "Wrong Ndx format"

        if self.modelset.shape[0] > outndx.modelset.shape[0]:
            logging.info('Number of models reduced from %d to %d', self.modelset.shape[0], outndx.modelset.shape[0])
        if self.segset.shape[0] > outndx.segset.shape[0]:
            logging.info('Number of test segments reduced from %d to %d', self.segset.shape[0], outndx.segset.shape[0])
        return outndx

    def validate(self):
        """Checks that an object of type Ndx obeys certain rules that
        must always be true.

        :return: a boolean value indicating whether the object is valid
        """
        arrays = (self.modelset, self.segset, self.trialmask)
        # Check the types first so that the shape tests below cannot fail
        # with an AttributeError on a non-array attribute.
        if not all(isinstance(arr, numpy.ndarray) for arr in arrays):
            return False
        if self.modelset.ndim != 1 or self.segset.ndim != 1:
            return False
        if self.trialmask.ndim != 2:
            return False
        return self.trialmask.shape == (self.modelset.shape[0], self.segset.shape[0])

    @staticmethod
    def read(input_file_name):
        """Creates an Ndx object from the information in an hdf5 file.

        :param input_file_name: name of the file to read from

        :return: the Ndx object read from the file
        """
        with h5py.File(input_file_name, "r") as f:
            ndx = Ndx()
            ndx.modelset = f.get("modelset")[()]
            ndx.segset = f.get("segset")[()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                # 'U' without a length keeps names whole; a fixed 'U100'
                # would silently truncate anything longer.
                ndx.modelset = ndx.modelset.astype('U', copy=False)
                ndx.segset = ndx.segset.astype('U', copy=False)

            ndx.trialmask = f.get("trial_mask")[()].astype('bool')

        assert ndx.validate(), "Error: wrong Ndx format"
        return ndx

    @classmethod
    @check_path_existance
    def read_txt(cls, input_filename):
        """Creates an Ndx object from information stored in a text file.
        Each non-blank line holds a model name followed by a test segment
        name, separated by whitespace.

        :param input_filename: name of the file to read from

        :return: the Ndx object built from the listed trials
        """
        with open(input_filename, 'r') as fid:
            # Blank lines carry no trial and are skipped.
            pairs = [line.split() for line in fid if line.strip()]

        models = numpy.empty(len(pairs), '|O')
        testsegs = numpy.empty(len(pairs), '|O')
        for ii, fields in enumerate(pairs):
            models[ii] = fields[0]
            testsegs[ii] = fields[1]

        ndx = Ndx()
        ndx.modelset, ndx.segset, ndx.trialmask = Ndx._index_trials(models, testsegs)

        assert ndx.validate(), "Wrong Ndx format"
        return ndx

    def merge(self, ndx_list):
        """Merges a list of Ndx objects into the current one.
        The resulting ndx must have all models and segment in the input
        ndxs (only once). A trial in any ndx becomes a trial in the
        output ndx

        :param ndx_list: list of Ndx objects to merge
        """
        assert isinstance(ndx_list, list), "Input is not a list"
        for ndx in ndx_list:
            assert isinstance(ndx, Ndx), \
                '{} {} {}'.format("Element ", ndx, " is not an Ndx")

        assert self.validate(), "Wrong Ndx format"

        for ndx2 in ndx_list:
            # union1d returns the sorted unique names of both objects
            modelset = numpy.union1d(self.modelset, ndx2.modelset)
            segset = numpy.union1d(self.segset, ndx2.segset)

            # expand both masks onto the merged sets, then combine them
            trials_1 = Ndx._expand_mask(self, modelset, segset)
            trials_2 = Ndx._expand_mask(ndx2, modelset, segset)

            self.modelset = modelset
            self.segset = segset
            self.trialmask = trials_1 | trials_2