# Copyright (c) 2006, National ICT Australia
# All rights reserved.
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the 'License'); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an 'AS IS' basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# Authors: Le Song (lesong@it.usyd.edu.au)
# Created: (20/10/2006)
# Last Updated: (dd/mm/yyyy)
#

##\package elefant.fselection.hsic
# This module perform computation related to Hilber-Schmidt Independence
# Criterion. Hilber-Schmidt Independence Criterion is short for HSIC.
#
# HISC is defined as $HSIC=\frac{1}{m}Tr(KHLH)$, where $kMat$ and $lMat$
# are the kernel matrices for the data and the labels respectively.
# $H=I-\frac{1}{m}\delta_{ij}$, where $m$ is the number of data points,
# is the centering matrix. The unbiased estimator of HSIC is computed as
# $HSIC=\frac{1}{m(m-3)}\left[Tr(KL)+\frac{1}{(m-1)(m-2)}1^\top K11^\top L1
# -\frac{2}{m-2}1^\top KL1\right]. For more theorectical underpinning
# of HSIC, see the following reference:
#
# Gretton, A., O. Bousquet, A. Smola and B. Schoelkopf: Measuring
# Statistical Dependence with Hilbert-Schmidt Norms. Algorithmic
# Learning Theory: 16th International Conference, ALT 2005, 63-78, 2005.
# 

__version__ = '$Revision: $' 
# $Source$

import numpy
import vector
from setdiag0 import setdiag0

## Class that perform computation related to HSIC.
#
# It contains function that computes biased and unbiased HSIC, part of HSIC
# necessary for faster its faster computation, and functions that enable
# an optimization on HSIC with respect to the kernel parameters.
#
class CHSIC(object):
    def __init__(self):
        pass

    ## Compute HLH give the labels.
    # @param y The labels.
    # @param kernely The kernel on the labels, default to linear kernel.
    #
    def ComputeHLH(self, y, kernely=vector.CLinearKernel()):
        ny = y.shape
        if len(ny) > 1:
            lMat = kernely.Dot(y, y)
        else:
            lMat = numpy.outerproduct(y, y)

        sL = numpy.sum(lMat, axis=1)
        ssL = numpy.sum(sL)
        # hlhMat
        return lMat - numpy.add.outer(sL, sL)/ny[0] + ssL/(ny[0]*ny[0])

    ## Compute the biased estimator of HSIC.
    # @param x The data.
    # @param y The labels.
    # @param kernelx The kernel on the data, default to linear kernel.
    # @param kernely The kernel on the labels, default to linear kernel.
    #
    def BiasedHSIC(self, x, y, kernelx=vector.CLinearKernel(), \
                   kernely=vector.CLinearKernel()):
        nx = x.shape
        ny = y.shape
        assert nx[0] == ny[0], \
               "Argument 1 and 2 have different number of data points"

        if len(nx) > 1:
            kMat = kernelx.Dot(x, x)
        else:
            kMat = numpy.outerproduct(x, x)

        hlhMat = ComputeHLH(y, kernely)
        return numpy.sum(numpy.sum(kMat * hlhMat)) / ((nx[0]-1)*(nx[0]-1))

    ## Objective of the biased HSIC when performing optimization over
    # the kernel parameters.
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param hlhMat The HLH matrix on the labels.
    #
    def ObjBiasedHSIC(self, param, x, kernelx, hlhMat):
        nx = x.shape
        kMat = kernelx.DotCacheKernel(x, param)
        return -numpy.sum(numpy.sum(kMat * hlhMat)) / ((nx[0]-1)*(nx[0]-1))

    ## Gradient of the objective of the biased HSIC when performing
    # optimization over the kernel parameters.
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param hlhMat The HLH matrix on the labels.
    #
    def GradBiasedHISC(self, param, x, kernelx, hlhMat):
        nx = x.shape
        kMat = kernelx.GradDotCacheKernel(x, param)
        return -numpy.sum(numpy.sum(kMat * hlhMat)) / ((nx[0]-1)*(nx[0]-1))

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the data and the HLH matrix for the labels are already
    # computed.
    # @param kMat The kernel matrix for the data.
    # @param hlhMat The HLH matrix for the labels.
    #
    def BiasedHSICFast(self, kMat, hlhMat):
        nx = kMat.shape
        assert (kMat.shape == hlhMat.shape), \
               "Argument 1 and 2 have different shapes"

        return (kMat * hlhMat).sum() / ((nx[0]-1)*(nx[0]-1))

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the labels can be decomposed into HLH = y * y' and the
    # rank of y is low
    # @param kMat The kernel matrix for the data.
    # @param y The HLH = y * y' for the labels.
    #
    def BiasedHSICFast2(self, kMat, y):
        nx = kMat.shape
        assert (kMat.shape[0] == y.shape[0]), \
               "Argument 1 and 2 have different shapes"

        return numpy.dot(y.T, numpy.dot(kMat, y)).trace() / ((nx[0]-1)*(nx[0]-1))

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the data K can be decomposed into K = x * x' and that 
    # for the labels can be decomposed into HLH = y * y' and the
    # rank of y is low (this will be useful after incomplete cholesky
    # factorization
    # @param x The K = x * x' for the data.
    # @param y The HLH = y * y' for the labels.
    #
    def BiasedHSICFast3(self, x, y):
        nx = x.shape
        assert (x.shape[0] == y.shape[0]), \
               "Argument 1 and 2 have different shapes"

        return (numpy.dot(x.T, y)**2).sum() / ((nx[0]-1)*(nx[0]-1))   
    

    ## Compute the UNbiased estimator of HSIC.
    # @param x The data.
    # @param y The labels.
    # @param kernelx The kernel on the data, default to linear kernel.
    # @param kernely The kernel on the labels, default to linear kernel.
    #
    def UnBiasedHSIC(self, x, y, kernelx=vector.CLinearKernel(), \
                     kernely=vector.CLinearKernel()):
        nx = x.shape
        ny = y.shape
        assert nx[0] == ny[0], \
               "Argument 1 and 2 have different number of data points"

        kMat = kernelx.Dot(x,x)
        setdiag0(kMat)

        lMat = kernely.Dot(y,y)
        setdiag0(lMat)

        sK = kMat.sum(axis=1)
        ssK = sK.sum()
        sL = lMat.sum(axis=1)
        ssL = sL.sum()

        return ( kMat.__imul__(lMat).sum() + \
                 (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) - \
                 2 * sK.__imul__(sL).sum() / (nx[0]-2) \
                 ) / (nx[0]*(nx[0]-3))

    ## Objective of the UNbiased HSIC when performing optimization over
    # the kernel parameters.
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The vector of the sum of all entries in lMat.
    #
    def ObjUnBiasedHSIC(self, param, x, kernelx, lMat, sL, ssL):
        nx = x.shape    
        kMat = kernelx.DotCacheKernel(x, param)
        sK = numpy.sum(kMat, axis=1)
        ssK = numpy.sum(sK)

        return -( numpy.sum(numpy.sum(kMat*lMat)) \
                  + (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) \
                  - 2*numpy.sum(sK*sL)/(nx[0]-2) \
                  ) / (nx[0]*(nx[0]-3))


    ## Gradient of the objective of the UNbiased HSIC when performing
    # optimization over the kernel parameters.
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The vector of the sum of all entries in lMat.
    #
    def GradUnBiasedHSIC(self, param, x, kernelx, lMat, sL, ssL):
        nx = x.shape
        kMat = kernelx.GradDotCacheKernel(x, param)
        sK = numpy.sum(kMat, axis=1)
        ssK = numpy.sum(sK)

        return -( numpy.sum(numpy.sum(kMat*lMat)) \
                  + (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) \
                  - 2*numpy.sum(sK*sL)/(nx[0]-2) \
                  ) / (nx[0]*(nx[0]-3))

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the data and the HLH matrix for the labels are already
    # computed.
    # @param kMat The kernel matrix for the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The vector of the sum of all entries in lMat.
    #
    def UnBiasedHSICFast(self, kMat, lMat, sL, ssL):
        nx = kMat.shape
        assert (kMat.shape == lMat.shape), \
               "Argument 1 and 2 have different shapes"

        sK = numpy.sum(kMat, axis=1)
        ssK = numpy.sum(sK)

        return ( numpy.sum(numpy.sum(kMat*lMat)) \
                 + (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) \
                 - 2*numpy.sum(sK*sL)/(nx[0]-2) \
                 ) / (nx[0]*(nx[0]-3))

## Normalize each dimension of the data separately to zero mean and unit
# standard deviation.
# @param data [read\write] The data to be normalized. Each row is a
# datum and each column a dimension.
#
def normalize(data):
    m = data.mean(axis=0)
    s = data.std(axis=0)
    data.__isub__(m).__itruediv__(s)

## Center the kernel matrix in the feature space.
# @param k [read\write] The kernel matrix to be centered. 
#
def center(k):
    n = k.shape
    assert n[0] == n[1], 'k must be symmetric and positive semidefinite'    
    mk = k.mean(axis=1)
    mk.shape = (n[0], 1)
    mmk = mk.mean()
    k.__isub__(mk).__isub__(mk.T).__iadd__(mmk)