Source code for svm_training

# -*- coding: utf-8 -*-
#
# This file is part of SIDEKIT.
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#
# SIDEKIT is a python package for speaker verification.
# Home page: http://www-lium.univ-lemans.fr/sidekit/
#    
# SIDEKIT is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the License, 
# or (at your option) any later version.
#
# SIDEKIT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with SIDEKIT.  If not, see <http://www.gnu.org/licenses/>.

"""
Copyright 2014-2019 Anthony Larcher

:mod:`svm_training` provides utilities to train Support Vector Machines
to perform speaker verification.
"""
import numpy
import os
import logging
from sidekit.libsvm.svmutil import svm_problem, svm_parameter, svm_train
from sidekit.statserver import StatServer
import multiprocessing
import sidekit.sv_utils


__license__ = "LGPL"
__author__ = "Anthony Larcher"
__copyright__ = "Copyright 2014-2019 Anthony Larcher"
__maintainer__ = "Anthony Larcher"
__email__ = "anthony.larcher@univ-lemans.fr"
__status__ = "Production"
__docformat__ = 'reStructuredText'


def svm_training_singleThread(K, msn, bsn, svm_dir, background_sv, models, enroll_sv):
    """Train one two-class Support Vector Machine per target model.

    Only the two-class task is implemented for now; this might change in
    the future to include multi-class classification.

    For every model, the Gram matrix of the background super-vectors plus
    the model's own enrollment super-vectors is assembled, an SVM is
    trained on it with libsvm's precomputed-kernel mode, and the resulting
    weight vector and bias are written to ``<svm_dir>/<model>.svm``.

    :param K: pre-computed part of the Gram matrix (background versus
          background), copied into the top-left ``bsn x bsn`` block
    :param msn: maximum number of sessions to train a SVM; sizes the work
          buffer shared by all models
    :param bsn: number of session used as background impostors
    :param svm_dir: directory where to store the SVM models
    :param background_sv: StatServer of super-vectors for background impostors. All
          super-vectors are used without selection
    :param models: list of models to train. The models must be included in the
          enroll_sv StatServer
    :param enroll_sv: StatServer of super-vectors used for the target models
    """
    # Work buffer sized for the model with the most sessions. The
    # background/background block is identical for every model, so it is
    # filled once; the remaining blocks are refreshed for each model.
    gram = numpy.zeros((bsn + msn, bsn + msn))
    gram[:bsn, :bsn] = K

    for model in models:
        logging.info('Train SVM model for %s', model)
        # Compute the part of the Kernel which depends on the enrollment data
        target_sv = enroll_sv.get_model_stat1(model)
        csn = enroll_sv.get_model_segments(model).shape[0]
        n = bsn + csn
        X = numpy.vstack((background_sv.stat1, target_sv))
        gram[:n, bsn:n] = numpy.dot(X, target_sv.transpose())
        gram[bsn:n, :bsn] = gram[:bsn, bsn:n].transpose()

        # Only the sub-block matching the sessions of the current model is
        # used. Rows beyond ``n`` are either zero or left over from a
        # previous model with more sessions: they do not correspond to any
        # row of X and must not be given to the trainer.
        sub_gram = gram[:n, :n]

        # labels of the impostor vectors are set to 2
        # labels of the target examples are set to 1
        labels = [2] * int(bsn) + [1] * int(csn)

        # libsvm expects each precomputed-kernel row to start with the
        # 1-based serial number of the example, followed by the kernel values.
        kernel = [[i + 1] + row for i, row in enumerate(sub_gram.tolist())]

        # isKernel=True must be set for precomputed kernel
        # Precomputed kernel data (-t 4)
        prob = svm_problem(labels, kernel, isKernel=True)
        c = 1 / numpy.mean(numpy.diag(sub_gram))
        param = svm_parameter('-t 4 -c {}'.format(c))
        svm = svm_train(prob, param)

        # Compute the weights: support-vector indices are 1-based, hence
        # the shift before selecting the matching rows of X.
        sv_rows = numpy.array(svm.get_sv_indices()) - 1
        w = -numpy.dot(X[sv_rows, ].transpose(), numpy.array(svm.get_sv_coef()))
        bsvm = svm.rho[0]
        svmFileName = os.path.join(svm_dir, model + '.svm')
        sidekit.sv_utils.save_svm(svmFileName, w, bsvm)


def svm_training(svmDir, background_sv, enroll_sv, num_thread=1):
    """Train two-class Support Vector Machine classifiers for all target models.

    Only the two-class task is implemented for now; this might change in
    the future to include multi-class classification.
    The unique model names of ``enroll_sv`` are split into ``num_thread``
    sub-lists and each non-empty sub-list is trained in its own process
    (``multiprocessing.Process``) running :func:`svm_training_singleThread`.
    A worker that ends with a non-zero exit code is reported through
    ``logging.error``.

    :param svmDir: directory where to store the SVM models
    :param background_sv: StatServer of super-vectors for background impostors. All
          super-vectors are used without selection
    :param enroll_sv: StatServer of super-vectors used for the target models
    :param num_thread: number of processes to launch in parallel
    """
    assert isinstance(background_sv, StatServer), 'Second parameter has to be a StatServer'
    assert isinstance(enroll_sv, StatServer), 'Third parameter has to be a StatServer'

    # The effective Kernel is initialized for the case of multi-session
    # by considering the maximum number of sessions per speaker.
    # For the SVM training, only a subpart of the kernel is used, accordingly
    # to the number of sessions of the current speaker
    K = background_sv.precompute_svm_kernel_stat1()
    bsn = K.shape[0]

    # A single pass yields both the unique model names and the number of
    # sessions of each model, instead of counting every entry against the
    # whole list.
    unique_models, session_counts = numpy.unique(enroll_sv.modelset, return_counts=True)
    msn = int(session_counts.max())

    # Process each sub-list of models in a separate process
    jobs = []
    for models in numpy.array_split(unique_models, num_thread):
        # More workers than models yields empty sub-lists: nothing to train.
        if models.size == 0:
            continue
        p = multiprocessing.Process(target=svm_training_singleThread,
                                    args=(K, msn, bsn, svmDir, background_sv, models, enroll_sv))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
        if p.exitcode != 0:
            logging.error('SVM training process %s exited with code %s', p.name, p.exitcode)