Initial commit

2014-11-21 18:52:56 +05:30
commit 3fd37a416f
@@ -0,0 +1,48 @@
 dump/
 *.dat
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 env/
 build/
 develop-eggs/
 dist/
 eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 *.egg-info/
 .installed.cfg
 *.egg
 # PyInstaller
 #  Usually these files are written by a python script from a template 
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .coverage
 .cache
 nosetests.xml
 coverage.xml
 # PyBuilder
 target/
@@ -0,0 +1,22 @@
 The MIT License (MIT)
 Copyright (c) 2014 Rishi Dua <rishirdua@gmail.com>, TV Ashok <veeranjaneyaashok@gmail.com>
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
@@ -0,0 +1,28 @@
 EMOTION CLASSIFICATION
 ======================
 Feature selection via dependence maximization for EEG-based emotion classification. This project was done as part of the Neural Networks course in the Fall 2014 semester at IIT Delhi.
 Supervisor: Dr. Jayadeva
 Authors
 -------
 Rishi Dua <http://github.com/rishirdua>
 TV Ashok <http://github.com/tvashok>
 Install
 -------
 1. Install Python and scikit-learn
 2. Copy the DEAP dataset (cPickle preprocessed) to data/raw folder
 3. Run script.sh
 Documentation
 -------------
 Refer to docs/readme.pdf
 Contribute
 ----------
 - Source Code: https://github.com/rishirdua/emotion-classification/
 License
 -------
 This project is licensed under the terms of the MIT license. See LICENSE.txt for details.
@@ -0,0 +1,190 @@
 # Copyright (c) 2006, National ICT Australia
 # All rights reserved.
 #
 # The contents of this file are subject to the Mozilla Public License Version
 # 1.1 (the 'License'); you may not use this file except in compliance with
 # the License. You may obtain a copy of the License at
 # http://www.mozilla.org/MPL/
 #
 # Software distributed under the License is distributed on an 'AS IS' basis,
 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 # for the specific language governing rights and limitations under the
 # License.
 #
 # Authors: Le Song (lesong@it.usyd.edu.au)
 # Created: (20/10/2006)
 # Last Updated: (dd/mm/yyyy)
 #
 ##\package elefant.fselection.bahsic
 # This module performs backward elimination for feature selection
 # using HSIC (BAHSIC).
 #
 # The algorithm proceeds iteratively, eliminating the least
 # relevant features and adding them to the eliminated list
 # in each iteration. For the theoretical underpinning, see the
 # following reference:
 #
 # Le Song, Justin Bedo, Karsten M. Borgwardt, Arthur Gretton
 # and Alex Smola. The BAHSIC family of gene selection algorithms.
 #
 __version__ = '$Revision: $' 
 # $Source$
 import numpy
 from scipy import optimize
 import vector
 from hsic import CHSIC
 from setdiag0 import setdiag0
 ## Class that performs backward elimination for feature selection (BAHSIC).
 #
 # It offers two versions of BAHSIC: one without optimization over the kernel
 # parameters (BAHSICRaw) and one with optimization over the kernel
 # parameters (BAHSICOpt).
 #
 class CBAHSIC(object):
    def __init__(self):
        pass

    ## Choose the features to eliminate in the current iteration.
    # @param objj The HSIC score computed for each remaining feature, in the
    #        same order as sI.
    # @param sI The indices of the remaining features.
    # @param m The number of remaining features.
    # @param flg3 The number of desired features.
    # @param flg4 The proportion of features eliminated in each iteration.
    # @return The entries of sI to eliminate, highest score first.
    #
    def _SelectEliminated(self, objj, sI, m, flg3, flg4):
        if m > flg3:
            order = numpy.argsort(objj)
            num = int(flg4 * m) + 1
            # Never drop below the desired number of features in one step.
            if m - num <= flg3:
                num = m - flg3
            # Always leave one feature for the final iteration; eliminating
            # all m at once can only be requested when flg3 is 0.
            num = min(num, m - 1)
            # Reverse-then-truncate instead of a negative-step slice: a slice
            # ending at m-num-1 turns into "stop at -1" (an empty selection,
            # hence an endless loop) when num == m.
            maxj = order[::-1][:num]
        else:
            maxj = numpy.array([numpy.argmax(objj)])
        return numpy.take(sI, maxj)

    ## BAHSIC with optimization over the kernel parameters.
    # @param x The data (2-D, one row per data point, one column per feature).
    # @param y The labels (2-D, one row per data point).
    # @param kernelx The kernel on the data.
    # @param kernely The kernel on the labels.
    # @param flg3 The number of desired features.
    # @param flg4 The proportion of features eliminated in each iteration.
    # @return The list of feature indices in order of elimination; the
    #         features towards the end of the list were kept the longest.
    #
    def BAHSICOpt(self, x, y, kernelx, kernely, flg3, flg4):
        assert len(x.shape) == 2, 'Argument 1 has wrong shape'
        assert len(y.shape) == 2, 'Argument 2 has wrong shape'
        assert x.shape[0] == y.shape[0], \
               'Argument 1 and 2 have different number of data points'

        print('--initializing...')
        hsic = CHSIC()

        # Label kernel matrix with zeroed diagonal, plus its row sums and
        # total sum; these stay fixed during the whole elimination.
        L = kernely.Dot(y, y)
        setdiag0(L)
        sL = numpy.sum(L, axis=1)
        ssL = numpy.sum(sL)

        n = x.shape
        eliminatedI = []
        selectedI = set(numpy.arange(n[1]))

        kernelx.CreateCacheKernel(x)
        sga = kernelx._typicalParam
        sgaN = sga.shape[0]

        while True:
            selectedI = selectedI - set(eliminatedI)
            sI = numpy.array([j for j in selectedI])
            m = len(sI)
            print(m)
            if m == 0:
                # Data without any feature: nothing to rank.
                break
            if m == 1:
                eliminatedI.append(selectedI.pop())
                break

            # Optimize the kernel parameters from each starting point and
            # keep the result with the smallest objective value.
            sgaMat = []
            hsicMat = []
            for k in range(sgaN):
                ## bfgs in scipy is not working here
                # 'args' must be a tuple of extra arguments.
                retval = optimize.fmin_cg(hsic.ObjUnBiasedHSIC, \
                                          sga[[k],].ravel(), \
                                          hsic.GradUnBiasedHSIC,\
                                          args=(x, kernelx, L, sL, ssL), \
                                          gtol=1e-6, maxiter=100, \
                                          full_output=True, disp=False)
                sgaMat.append(retval[0])
                hsicMat.append(retval[1])
            k = numpy.argmin(hsicMat)
            sga0 = sgaMat[k]

            # Score every remaining feature (same iteration order as sI).
            # NOTE(review): DecDotCacheKernel presumably returns the kernel
            # matrix with feature j removed -- confirm in the kernel class.
            objj = []
            for j in selectedI:
                K = kernelx.DecDotCacheKernel(x, x[:,[j]], sga0)
                setdiag0(K)
                objj.append(hsic.UnBiasedHSICFast(K, L, sL, ssL))

            j = self._SelectEliminated(objj, sI, m, flg3, flg4)
            eliminatedI.extend(j)
            kernelx.DecCacheKernel(x, x[:,j])

        kernelx.ClearCacheKernel(x)
        return eliminatedI

    ## BAHSIC without optimization over the kernel parameters.
    # @param x The data (2-D, one row per data point, one column per feature).
    # @param y The labels (2-D, one row per data point).
    # @param kernelx The kernel on the data.
    # @param kernely The kernel on the labels.
    # @param flg3 The number of desired features.
    # @param flg4 The proportion of features eliminated in each iteration.
    # @return The list of feature indices in order of elimination; the
    #         features towards the end of the list were kept the longest.
    #
    def BAHSICRaw(self, x, y, kernelx, kernely, flg3, flg4):
        assert len(x.shape) == 2, 'Argument 1 has wrong shape'
        assert len(y.shape) == 2, 'Argument 2 has wrong shape'
        assert x.shape[0] == y.shape[0], \
               'Argument 1 and 2 have different number of data points'

        print('--initializing...')
        hsic = CHSIC()

        # Label kernel matrix with zeroed diagonal, plus its row sums and
        # total sum; these stay fixed during the whole elimination.
        L = kernely.Dot(y, y)
        setdiag0(L)
        sL = numpy.sum(L, axis=1)
        ssL = numpy.sum(sL)

        n = x.shape
        eliminatedI = []
        selectedI = set(numpy.arange(n[1]))

        kernelx.CreateCacheKernel(x)

        while True:
            selectedI = selectedI - set(eliminatedI)
            sI = numpy.array([j for j in selectedI])
            m = len(sI)
            print(m)
            if m == 0:
                # Data without any feature: nothing to rank.
                break
            if m == 1:
                eliminatedI.append(selectedI.pop())
                break

            # Score every remaining feature (same iteration order as sI).
            # NOTE(review): DecDotCacheKernel presumably returns the kernel
            # matrix with feature j removed -- confirm in the kernel class.
            objj = []
            for j in selectedI:
                K = kernelx.DecDotCacheKernel(x, x[:,[j]])
                setdiag0(K)
                objj.append(hsic.UnBiasedHSICFast(K, L, sL, ssL))

            j = self._SelectEliminated(objj, sI, m, flg3, flg4)
            eliminatedI.extend(j)
            kernelx.DecCacheKernel(x, x[:,j])

        kernelx.ClearCacheKernel(x)
        return eliminatedI
@@ -0,0 +1,46 @@
 import cPickle
 import os.path
 from multiprocessing import Pool
 import sys
 def main():
 	"""Export a subsample of the DEAP recordings to plain-text files.

 	Loads the pickled participant files ``data/raw/sNN.dat`` of every 8th
 	participant (s01, s09, s17, s25).  For each of the 40 trials one line is
 	written to ``data/features_raw.dat`` holding every 32nd time sample of
 	all 40 channels, each value followed by a space.  The four label values
 	of each trial are written, one per line, to ``data/labels_0.dat`` ..
 	``data/labels_3.dat``.
 	"""
 	nLabel, nTrial, nUser, nChannel, nTime = 4, 40, 32, 40, 8064
 	print("Program started"+"\n")
 	fout_data = open("data/features_raw.dat", 'w')
 	fout_labels = []
 	try:
 		for k in range(nLabel):
 			fout_labels.append(open("data/labels_%d.dat" % k, 'w'))
 		for i in range(0, nUser, 8):
 			# Participant files are numbered from 1 with two digits.
 			fname = "data/raw/s%02d.dat" % (i + 1)
 			# NOTE(review): unpickling can execute arbitrary code; only run
 			# this on dataset files obtained from a trusted source.
 			with open(fname, 'rb') as fin:
 				x = cPickle.load(fin)
 			print(fname)
 			for tr in range(nTrial):
 				# Time-major order: all channels of one kept sample, then
 				# the next kept sample.
 				row = []
 				for dat in range(0, nTime, 32):
 					for ch in range(nChannel):
 						row.append(str(x['data'][tr][ch][dat]) + " ")
 				fout_data.write("".join(row) + "\n")
 				for k in range(nLabel):
 					fout_labels[k].write(str(x['labels'][tr][k]) + "\n")
 	finally:
 		# Close whatever was opened, even if loading or writing failed.
 		for f in fout_labels:
 			f.close()
 		fout_data.close()
 	print("\n"+"Print Successful")
 # Run the export only when this file is executed as a script.
 if __name__ == "__main__":
 	main()
@@ -0,0 +1,56 @@
 import time
 import sys
 import numpy
 import vector
 from bahsic import CBAHSIC
 # Shown when fewer than three file arguments are given.
 usage = "usage: python %s <features_file> <labels_file> <output_file> [normalized_output_file]" % sys.argv[0]
 if __name__ == "__main__":
 	if len(sys.argv) < 4:
 		print(usage)
 	else:
 		file_x = sys.argv[1]
 		file_y = sys.argv[2]
 		file_out = sys.argv[3]
 		# Optional fourth argument: where to store the normalized data.
 		file_normalized = sys.argv[4] if len(sys.argv) >= 5 else None

 		X = numpy.genfromtxt(file_x, delimiter=' ')
 		y = numpy.genfromtxt(file_y, delimiter=' ')
 		bahsic = CBAHSIC()

 		# One row of X per data point (the original data set has 160).
 		data_no = X.shape[0]
 		features_tokeep = 5040
 		y.shape = (data_no,1)

 		# Normalize the labels.
 		# NOTE(review): pno/nno are the class sizes only if the labels are
 		# +1/-1 -- confirm against the label files in use.
 		y = 1.0*y
 		tmp_no = numpy.sum(y)
 		pno = (data_no + tmp_no) / 2
 		nno = (data_no - tmp_no) / 2
 		y[y>0] = y[y>0]/pno
 		y[y<0] = y[y<0]/nno

 		# Normalize the data to zero mean and unit standard deviation.
 		m = X.mean(0)
 		s = X.std(0)
 		# A constant column has zero deviation; divide it by 1 so it becomes
 		# all zeros instead of NaN.
 		s[s == 0] = 1.0
 		X -= m
 		X /= s

 		# time.clock is missing on newer Python versions; use perf_counter
 		# where it exists.
 		timer = getattr(time, 'perf_counter', None) or time.clock
 		t1 = timer()
 		tmp = bahsic.BAHSICRaw(X, y, vector.CLinearKernel(), vector.CLinearKernel(), features_tokeep, 0.1)
 		t2 = timer()
 		print("time taken: "+str(t2-t1))
 		print('--rank of the features')
 		print('--better features towards the end of the list:')
 		print(tmp)

 		# The best features are the last ones eliminated, i.e. the tail of
 		# the ranking, whatever the total number of features is.
 		kept = numpy.asarray(tmp[-features_tokeep:], dtype=int)
 		hsicfeatures = X[:, kept]
 		numpy.savetxt(file_out, hsicfeatures)
 		if file_normalized is not None:
 			numpy.savetxt(file_normalized, X)
@@ -0,0 +1,39 @@
 %% DESCRIPTION
 % Maps the raw features onto Rahimi and Recht's random Fourier features, as defined in:
 % Rahimi, Ali, and Benjamin Recht. "Random features for large-scale kernel machines." In Advances in neural information processing systems, pp. 1177-1184. 2007.
 clc;
 clear all;

 %% Load the raw feature matrix (one row per data point)
 X = dlmread('data/features_raw.dat', ' ');
 [n_data, n_features] = size(X);

 %% Parameters
 gamma_inv = 0.1;
 gamma = 1/gamma_inv;               % not used below
 sigma = sqrt(2/gamma_inv);         % standard deviation of the random weights
 n_randomfeatures = 5040;

 %% Random Fourier transform
 W = normrnd(0, sigma, n_features, n_randomfeatures);   % Gaussian projection weights
 b = 2*pi*rand(1, n_randomfeatures);                    % uniform random phases between 0 and 2*pi
 B = repmat(b, n_data, 1);                              % the same phases for every data point
 scale = sqrt(2/n_randomfeatures);
 X_rrt = scale*cos(X*W + B);
 disp('calculated');

 %% Write the transformed features
 % (no normalization is applied to the transformed features)
 dlmwrite('data/features_rrt.dat', X_rrt, ' ');
 disp('writen to file');
@@ -0,0 +1,36 @@
 import cPickle
 import os.path
 from multiprocessing import Pool
 import sys
 # Usage hint printed when the script is not called with exactly one argument.
 usage = "filename out_file"
 def generate_features(fout_file):
 	"""Write a subsample of the DEAP recordings to a plain-text feature file.

 	Loads the pickled participant files ``data/raw/sNN.dat`` of every 8th
 	participant (s01, s09, s17, s25).  For each of the 40 trials one line is
 	written to ``fout_file`` holding every 64th time sample of all 40
 	channels, each value followed by a space.

 	fout_file -- path of the text file to create (overwritten if present).
 	"""
 	nTrial, nUser, nChannel, nTime = 40, 32, 40, 8064
 	print("Program started"+"\n")
 	with open(fout_file, 'w') as fout_data:
 		for i in range(0, nUser, 8):
 			# Participant files are numbered from 1 with two digits.
 			fname = "data/raw/s%02d.dat" % (i + 1)
 			# NOTE(review): unpickling can execute arbitrary code; only run
 			# this on dataset files obtained from a trusted source.
 			with open(fname, 'rb') as fin:
 				x = cPickle.load(fin)
 			print(fname)
 			for tr in range(nTrial):
 				# Time-major order: all channels of one kept sample, then
 				# the next kept sample.
 				row = []
 				for dat in range(0, nTime, 64):
 					for ch in range(nChannel):
 						row.append(str(x['data'][tr][ch][dat]) + " ")
 				fout_data.write("".join(row) + "\n")
 if __name__ == "__main__":
 	# Exactly one argument is expected: the path of the output file.
 	if len(sys.argv) == 2:
 		generate_features(sys.argv[1])
 	else:
 		print(usage)
@@ -0,0 +1,168 @@
 # Copyright (c) 2006, National ICT Australia
 # All rights reserved.
 #
 # The contents of this file are subject to the Mozilla Public License Version
 # 1.1 (the "License"); you may not use this file except in compliance with
 # the License. You may obtain a copy of the License at
 # http://www.mozilla.org/MPL/
 #
 # Software distributed under the License is distributed on an "AS IS" basis,
 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 # for the specific language governing rights and limitations under the
 # License.
 #
 # Authors: Le Song (lesong@it.usyd.edu.au) and Alex Smola
 # (alex.smola@nicta.com.au)
 # Created: (20/10/2006)
 # Last Updated: (dd/mm/yyyy)
 #
 ##\package elefant.kernels.generic
 # This module contains the generic base class for kernels.
 #
 # The CKernel class provides a common interface for all kernel classes. Note
 # that it should never be instantiated.
 #
 __version__ = "$Revision: $" 
 # $Source$ 
 import numpy
 import numpy.random as random
 ## Generic kernel class
 #
 # This class provides a common interface for all kernels. The interface
 # includes the following key kernel manipulations (functions):
 # --Dot(x1, x2): $K(x1, x2)$
 # --Expand(x1, x2, alpha2): $sum_r K(x1_i,x2_r) \times alpha2_r$
 # --Tensor(x1, y1, x2, y2): $K(x1_i,x2_j) \times (y1_i \times y2_j)$
 # --TensorExpand(x1, y1, x2, y2, alpha2):
 # $sum_r K(x1_i,x2_r) \times (y1_i \times y2_r) \times alpha2_r$
 # --Remember(x): Remember data x
 # --Forget(x): Remove remembered data x
 # To design a specific kernel, simply override these methods. The generic
 # kernel itself should never be instantiated; every operation raises
 # NotImplementedError.
 #
 class CKernel(object):
    def __init__(self, blocksize=128):
        ## @var _blocksize
        # Parameter that determines the size of each block when computing the
        # kernel matrix in blocks. Properly blocking the kernel matrix during
        # computation improves the speed.
        #
        self._blocksize = blocksize

        ## @var _name
        # Name of the kernel.
        #
        self._name = "Generic kernel"

        ## @var _cacheData
        # Cache that stores data that have appeared before.
        #
        self._cacheData = {}

    def __str__(self):
        return self._name

    def __repr__(self):
        return "Kernel object of type '" + self._name + "'"

    ## Compute the kernel between two data points x1 and x2.
    # It returns a scalar value, the dot product between x1 and x2.
    # @param x1 [read] The first data point.
    # @param x2 [read] The second data point.
    #
    def K(self, x1, x2):
        # Call form of raise: valid on both Python 2 and Python 3.
        raise NotImplementedError(
              'CKernel.K in abstract class is not implemented')

    ## Compute the kernel between the data points in x1 and those in x2.
    # It returns a matrix with entry $(ij)$ equal to $K(x1_i, x2_j)$.
    # If index1/index2 is
    # specified, only those data points in x1/x2 with indices corresponding
    # to index1/index2 are used to compute the kernel matrix. Furthermore,
    # if output is specified, the provided buffer is used explicitly to
    # store the kernel matrix.
    # @param x1 [read] The first set of data points.
    # @param x2 [read] The second set of data points.
    # @param index1 [read] The indices into the first set of data points.
    # @param index2 [read] The indices into the second set of data points.
    # @param output [write] The buffer where the output matrix is written into.
    #
    def Dot(self, x1, x2, index1=None, index2=None, output=None):
        raise NotImplementedError(
              'CKernel.Dot in abstract class is not implemented')

    ## Compute the kernel between the data points in x1 and those in x2,
    # then multiply the resulting kernel matrix by alpha2.
    # It returns a matrix with entry $(ij)$ equal to
    # $sum_r K(x1_i,x2_r) \times alpha2_r$.
    # Other parameters are defined similarly as those in Dot.
    # @param x1 [read] The first set of data points.
    # @param x2 [read] The second set of data points.
    # @param alpha2 [read] The set of coefficients.
    # @param index1 [read] The indices into the first set of data points.
    # @param index2 [read] The indices into the second set of data points.
    # @param output [write] The buffer where the output matrix is written into.
    #
    def Expand(self, x1, x2, alpha2, index1=None, index2=None, output=None):
        raise NotImplementedError(
              'CKernel.Expand in abstract class is not implemented')

    ## Compute the kernel between the data points in x1 and those in x2,
    # then multiply the resulting kernel matrix elementwise by
    # the outer-product matrix between y1 and y2. It returns a matrix
    # with entry $(ij)$ equal to $K(x1_i,x2_j) \times (y1_i \times y2_j)$.
    # Other parameters are defined similarly as those in Dot.
    # @param x1 [read] The first set of data points.
    # @param y1 [read] The first set of labels.
    # @param x2 [read] The second set of data points.
    # @param y2 [read] The second set of labels.
    # @param index1 [read] The indices into the first set of data points.
    # @param index2 [read] The indices into the second set of data points.
    # @param output [write] The buffer where the output matrix is written into.
    #
    def Tensor(self, x1, y1, x2, y2, index1=None, index2=None, output=None):
        raise NotImplementedError(
              'CKernel.Tensor in abstract class is not implemented')

    ## Compute the kernel between the data points in x1 and those in x2,
    # then multiply the resulting kernel matrix elementwise by
    # the outer-product matrix between y1 and y2, and finally multiply
    # the resulting matrix by alpha2. It returns a matrix with entry $(ij)$
    # equal to $sum_r K(x1_i,x2_r) \times (y1_i \times y2_r) \times alpha2_r$.
    # Other parameters are defined similarly as those in Dot.
    # @param x1 [read] The first set of data points.
    # @param y1 [read] The first set of labels.
    # @param x2 [read] The second set of data points.
    # @param y2 [read] The second set of labels.
    # @param alpha2 [read] The set of coefficients.
    # @param index1 [read] The indices into the first set of data points.
    # @param index2 [read] The indices into the second set of data points.
    # @param output [write] The buffer where the output matrix is written into.
    #
    def TensorExpand(self, x1, y1, x2, y2, alpha2, index1=None, index2=None, \
                     output=None):
        raise NotImplementedError(
              'CKernel.TensorExpand in abstract class is not implemented')

    ## Remember the data by performing the necessary preprocessing on
    # the data, storing it in the cache and indexing it by the id of
    # the data. The preprocessing can be defined differently for
    # different classes. If the data have already been remembered,
    # the old stored information is simply overwritten.
    # @param x [read] The data to be remembered.
    #
    def Remember(self, x):
        raise NotImplementedError(
              'CKernel.Remember in abstract class is not implemented')

    ## Remove remembered data from the cache. If x is not given, then
    # all the data remembered in the cache will be removed. If a given
    # x is not remembered beforehand, False is returned; otherwise, True
    # is returned.
    # @param x [read] The data to be removed.
    #
    def Forget(self, x=None):
        raise NotImplementedError(
              'CKernel.Forget in abstract class is not implemented')
@@ -0,0 +1,267 @@
 # Copyright (c) 2006, National ICT Australia
 # All rights reserved.
 #
 # The contents of this file are subject to the Mozilla Public License Version
 # 1.1 (the 'License'); you may not use this file except in compliance with
 # the License. You may obtain a copy of the License at
 # http://www.mozilla.org/MPL/
 #
 # Software distributed under the License is distributed on an 'AS IS' basis,
 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 # for the specific language governing rights and limitations under the
 # License.
 #
 # Authors: Le Song (lesong@it.usyd.edu.au)
 # Created: (20/10/2006)
 # Last Updated: (dd/mm/yyyy)
 #
 ##\package elefant.fselection.hsic
 # This module performs computations related to the Hilbert-Schmidt
 # Independence Criterion (HSIC).
 #
 # The biased estimator of HSIC is defined as
 # $HSIC=\frac{1}{(m-1)^2}Tr(KHLH)$, where $K$ and $L$
 # are the kernel matrices for the data and the labels respectively, and
 # $H=I-\frac{1}{m}11^\top$, where $m$ is the number of data points,
 # is the centering matrix. The unbiased estimator of HSIC is computed as
 # $HSIC=\frac{1}{m(m-3)}\left[Tr(KL)+\frac{1}{(m-1)(m-2)}1^\top K11^\top L1
 # -\frac{2}{m-2}1^\top KL1\right]$, where the diagonals of $K$ and $L$ are
 # set to zero. For more theoretical underpinning
 # of HSIC, see the following reference:
 #
 # Gretton, A., O. Bousquet, A. Smola and B. Schoelkopf: Measuring
 # Statistical Dependence with Hilbert-Schmidt Norms. Algorithmic
 # Learning Theory: 16th International Conference, ALT 2005, 63-78, 2005.
 # 
 __version__ = '$Revision: $' 
 # $Source$
 import numpy
 import vector
 from setdiag0 import setdiag0
 ## Class that performs computations related to HSIC.
 #
 # It contains functions that compute the biased and the unbiased HSIC,
 # faster variants that work on precomputed matrices, and functions that
 # enable an optimization of HSIC with respect to the kernel parameters.
 #
 class CHSIC(object):
    def __init__(self):
        pass

    ## Return the given kernel, or a new linear kernel if none was given.
    # A new kernel is created per call so that no kernel object is shared
    # between unrelated calls.
    # @param kernel The kernel passed by the caller, or None.
    #
    def _KernelOrLinear(self, kernel):
        if kernel is None:
            return vector.CLinearKernel()
        return kernel

    ## Evaluate the bracketed sum of the unbiased HSIC estimator divided
    # by n(n-3). The result is only meaningful for n > 3.
    # @param kMat The kernel matrix for the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The sum of all entries in lMat.
    # @param n The number of data points.
    #
    def _UnBiasedValue(self, kMat, lMat, sL, ssL, n):
        # Float n keeps the denominators from being floored on Python 2.
        n = float(n)
        sK = numpy.sum(kMat, axis=1)
        ssK = numpy.sum(sK)
        return ( numpy.sum(kMat*lMat) \
                 + (ssK*ssL)/((n-1)*(n-2)) \
                 - 2*numpy.sum(sK*sL)/(n-2) \
                 ) / (n*(n-3))

    ## Compute HLH given the labels.
    # @param y The labels. A 1-D y is treated as a vector and its outer
    #        product is used as the kernel matrix.
    # @param kernely The kernel on the labels, default to linear kernel.
    #
    def ComputeHLH(self, y, kernely=None):
        ny = y.shape
        if len(ny) > 1:
            lMat = self._KernelOrLinear(kernely).Dot(y, y)
        else:
            lMat = numpy.outer(y, y)

        sL = numpy.sum(lMat, axis=1)
        ssL = numpy.sum(sL)
        n = float(ny[0])
        # hlhMat
        return lMat - numpy.add.outer(sL, sL)/n + ssL/(n*n)

    ## Compute the biased estimator of HSIC.
    # @param x The data. A 1-D x is treated as a vector and its outer
    #        product is used as the kernel matrix.
    # @param y The labels.
    # @param kernelx The kernel on the data, default to linear kernel.
    # @param kernely The kernel on the labels, default to linear kernel.
    #
    def BiasedHSIC(self, x, y, kernelx=None, kernely=None):
        nx = x.shape
        ny = y.shape
        assert nx[0] == ny[0], \
               "Argument 1 and 2 have different number of data points"

        if len(nx) > 1:
            kMat = self._KernelOrLinear(kernelx).Dot(x, x)
        else:
            kMat = numpy.outer(x, x)

        hlhMat = self.ComputeHLH(y, kernely)
        return numpy.sum(kMat * hlhMat) / float((nx[0]-1)*(nx[0]-1))

    ## Objective of the biased HSIC when performing optimization over
    # the kernel parameters (the negated biased HSIC).
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param hlhMat The HLH matrix on the labels.
    #
    def ObjBiasedHSIC(self, param, x, kernelx, hlhMat):
        nx = x.shape
        kMat = kernelx.DotCacheKernel(x, param)
        return -numpy.sum(kMat * hlhMat) / float((nx[0]-1)*(nx[0]-1))

    ## Gradient of the objective of the biased HSIC when performing
    # optimization over the kernel parameters.
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param hlhMat The HLH matrix on the labels.
    #
    def GradBiasedHISC(self, param, x, kernelx, hlhMat):
        nx = x.shape
        kMat = kernelx.GradDotCacheKernel(x, param)
        return -numpy.sum(kMat * hlhMat) / float((nx[0]-1)*(nx[0]-1))

    ## Correctly spelled alias of GradBiasedHISC; the original name is kept
    # for existing callers.
    GradBiasedHSIC = GradBiasedHISC

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the data and the HLH matrix for the labels are already
    # computed.
    # @param kMat The kernel matrix for the data.
    # @param hlhMat The HLH matrix for the labels.
    #
    def BiasedHSICFast(self, kMat, hlhMat):
        nx = kMat.shape
        assert (kMat.shape == hlhMat.shape), \
               "Argument 1 and 2 have different shapes"
        return (kMat * hlhMat).sum() / float((nx[0]-1)*(nx[0]-1))

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the labels can be decomposed into HLH = y * y' and the
    # rank of y is low.
    # @param kMat The kernel matrix for the data.
    # @param y The factor y of HLH = y * y' for the labels.
    #
    def BiasedHSICFast2(self, kMat, y):
        nx = kMat.shape
        assert (kMat.shape[0] == y.shape[0]), \
               "Argument 1 and 2 have different shapes"
        return numpy.dot(y.T, numpy.dot(kMat, y)).trace() / float((nx[0]-1)*(nx[0]-1))

    ## Fast computation of the biased HSIC when the kernel matrix
    # for the data K can be decomposed into K = x * x' and that
    # for the labels can be decomposed into HLH = y * y' and the
    # rank of y is low (this will be useful after incomplete Cholesky
    # factorization).
    # @param x The factor x of K = x * x' for the data.
    # @param y The factor y of HLH = y * y' for the labels.
    #
    def BiasedHSICFast3(self, x, y):
        nx = x.shape
        assert (x.shape[0] == y.shape[0]), \
               "Argument 1 and 2 have different shapes"
        return (numpy.dot(x.T, y)**2).sum() / float((nx[0]-1)*(nx[0]-1))

    ## Compute the unbiased estimator of HSIC.
    # @param x The data.
    # @param y The labels.
    # @param kernelx The kernel on the data, default to linear kernel.
    # @param kernely The kernel on the labels, default to linear kernel.
    #
    def UnBiasedHSIC(self, x, y, kernelx=None, kernely=None):
        nx = x.shape
        ny = y.shape
        assert nx[0] == ny[0], \
               "Argument 1 and 2 have different number of data points"

        # Both kernel matrices enter the estimator with a zero diagonal.
        kMat = self._KernelOrLinear(kernelx).Dot(x,x)
        setdiag0(kMat)

        lMat = self._KernelOrLinear(kernely).Dot(y,y)
        setdiag0(lMat)

        sL = lMat.sum(axis=1)
        ssL = sL.sum()

        return self._UnBiasedValue(kMat, lMat, sL, ssL, nx[0])

    ## Objective of the unbiased HSIC when performing optimization over
    # the kernel parameters (the negated unbiased HSIC).
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The sum of all entries in lMat.
    #
    def ObjUnBiasedHSIC(self, param, x, kernelx, lMat, sL, ssL):
        kMat = kernelx.DotCacheKernel(x, param)
        return -self._UnBiasedValue(kMat, lMat, sL, ssL, x.shape[0])

    ## Gradient of the objective of the unbiased HSIC when performing
    # optimization over the kernel parameters.
    # @param param The kernel parameters.
    # @param x The data.
    # @param kernelx The kernel on the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The sum of all entries in lMat.
    #
    def GradUnBiasedHSIC(self, param, x, kernelx, lMat, sL, ssL):
        kMat = kernelx.GradDotCacheKernel(x, param)
        return -self._UnBiasedValue(kMat, lMat, sL, ssL, x.shape[0])

    ## Fast computation of the unbiased HSIC when the kernel matrices
    # for the data and for the labels are already computed.
    # @param kMat The kernel matrix for the data.
    # @param lMat The kernel matrix of the label.
    # @param sL The vector of the sum of each row of lMat.
    # @param ssL The sum of all entries in lMat.
    #
    def UnBiasedHSICFast(self, kMat, lMat, sL, ssL):
        assert (kMat.shape == lMat.shape), \
               "Argument 1 and 2 have different shapes"
        return self._UnBiasedValue(kMat, lMat, sL, ssL, kMat.shape[0])
 ## Normalize each dimension of the data separately to zero mean and unit
 # standard deviation. The data are modified in place and nothing is
 # returned. A dimension that is constant over all data (zero standard
 # deviation) becomes all zeros instead of NaN.
 # @param data [read\write] The data to be normalized (floating point).
 # Each row is a datum and each column a dimension.
 #
 def normalize(data):
    m = data.mean(axis=0)
    s = data.std(axis=0)
    # Dividing by a zero deviation would fill the column with NaN.
    s = numpy.where(s == 0, 1.0, s)
    data -= m
    data /= s
 ## Center the kernel matrix in the feature space. The matrix is modified
 # in place and nothing is returned.
 # @param k [read\write] The (square) kernel matrix to be centered.
 #
 def center(k):
    dims = k.shape
    assert dims[0] == dims[1], 'k must be symmetric and positive semidefinite'
    # Row means as a column vector, and their overall mean.
    row_means = k.mean(axis=1).reshape(dims[0], 1)
    grand_mean = row_means.mean()
    # Subtract the row means along both axes, then add the overall mean back.
    k -= row_means
    k -= row_means.T
    k += grand_mean
@@ -0,0 +1,30 @@
 regression hsic linear
 regression_linear.py data/features_hsic_0.dat data/labels_0.dat data/predict_hsic_0.dat 0.0462957047621
 regression_linear.py data/features_hsic_1.dat data/labels_1.dat data/predict_hsic_1.dat 0.0619267723098
 regression_linear.py data/features_hsic_2.dat data/labels_2.dat data/predict_hsic_2.dat 0.119146367285
 regression_linear.py data/features_hsic_3.dat data/labels_3.dat data/predict_hsic_3.dat 0.108100811067
 regression hsic bayesian
 regression_bayesian.py data/features_hsic_0.dat data/labels_0.dat data/predict_hsic_0.dat 0.0574420849846
 regression_bayesian.py data/features_hsic_1.dat data/labels_1.dat data/predict_hsic_1.dat 0.0744427035158
 regression_bayesian.py data/features_hsic_2.dat data/labels_2.dat data/predict_hsic_2.dat 0.131973261761
 regression_bayesian.py data/features_hsic_3.dat data/labels_3.dat data/predict_hsic_3.dat 0.1292239022
 regression hsic dtree
 regression_dtree.py data/features_hsic_0.dat data/labels_0.dat data/predict_hsic_0.dat 0.0616359499887
 regression_dtree.py data/features_hsic_1.dat data/labels_1.dat data/predict_hsic_1.dat 0.0870560437385
 regression_dtree.py data/features_hsic_2.dat data/labels_2.dat data/predict_hsic_2.dat 0.225990113056
 regression_dtree.py data/features_hsic_3.dat data/labels_3.dat data/predict_hsic_3.dat 0.122724406103
 regression pca linear
 regression_linear.py data/features_pca.dat data/labels_0.dat data/predict_pca_0.dat 0.0525417950272
 regression_linear.py data/features_pca.dat data/labels_1.dat data/predict_pca_1.dat 0.0630462800532
 regression_linear.py data/features_pca.dat data/labels_2.dat data/predict_pca_2.dat 0.11837933335
 regression_linear.py data/features_pca.dat data/labels_3.dat data/predict_pca_3.dat 0.105251711141
 regression pca bayesian
 regression_bayesian.py data/features_pca.dat data/labels_0.dat data/predict_pca_0.dat 0.0598334909344
 regression_bayesian.py data/features_pca.dat data/labels_1.dat data/predict_pca_1.dat 0.0741352839217
 regression_bayesian.py data/features_pca.dat data/labels_2.dat data/predict_pca_2.dat 0.122554051214
 regression_bayesian.py data/features_pca.dat data/labels_3.dat data/predict_pca_3.dat 0.122681510711
 regression pca dtree
 regression_dtree.py data/features_pca.dat data/labels_0.dat data/predict_pca_0.dat 0.0547733949265
 regression_dtree.py data/features_pca.dat data/labels_1.dat data/predict_pca_1.dat 0.0922293538752
 regression_dtree.py data/features_pca.dat data/labels_2.dat data/predict_pca_2.dat 0.19924179118
 regression_dtree.py data/features_pca.dat data/labels_3.dat data/predict_pca_3.dat 0.0951762860752
@@ -0,0 +1,38 @@
 # Copyright (c) 2007, National ICT Australia
 # All rights reserved.
 #
 # The contents of this file are subject to the Mozilla Public License Version
 # 1.1 (the 'License'); you may not use this file except in compliance with
 # the License. You may obtain a copy of the License at
 # http://www.mozilla.org/MPL/
 #
 # Software distributed under the License is distributed on an 'AS IS' basis,
 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 # for the specific language governing rights and limitations under the
 # License.
 #
 # Authors: Christfried Webers
 # Created: (09/10/2007)
 # Last Updated: 
 #
 ## Exception classes for the Elefant project
 class CElefantException(Exception):
    """Base class for exceptions in Elefant."""
    pass
 class CElefantConstraintException(CElefantException):
    """Exception raised for constraint violation.
       Attributes:
            value   -- input value violating the constraint
            message -- explanation of the error
    """
    def __init__(self, value, message):
        # Forward both arguments to the base class so that e.args, str(e)
        # and repr(e) carry them, and so that copying/pickling can rebuild
        # the exception by calling __init__ with the same two arguments.
        super(CElefantConstraintException, self).__init__(value, message)
        self.value = value
        self.message = message
@@ -0,0 +1,30 @@
 import time
 import sys
 import numpy
 import vector
 from sklearn.linear_model import BayesianRidge, LinearRegression
 from sklearn.cross_validation import train_test_split
 # Command-line synopsis; the script needs exactly four arguments.
 usage = "filename features_file labels_file output_file split_seed"
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print(usage)
     else:
         file_x = sys.argv[1]
         file_y = sys.argv[2]
         file_out = sys.argv[3]
         # train_test_split only accepts an integer (or RandomState) seed,
         # so the command-line string has to be converted first.
         split_seed = int(sys.argv[4])
         X = numpy.genfromtxt(file_x, delimiter=' ')
         y = numpy.genfromtxt(file_y, delimiter=' ')
         # Split the data into training/testing sets
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
         # Bayesian Ridge Regression, fitted on the training part only so
         # the held-out rows used for prediction are never seen in training.
         clf = BayesianRidge(compute_score=True)
         clf.fit(X_train, y_train)
         y_predict = clf.predict(X_test)
         numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,30 @@
 import time
 import sys
 import numpy
 import vector
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.cross_validation import train_test_split
 # Command-line synopsis; the script needs exactly four arguments.
 usage = "filename features_file labels_file output_file split_seed"
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print(usage)
     else:
         file_x = sys.argv[1]
         file_y = sys.argv[2]
         file_out = sys.argv[3]
         # train_test_split only accepts an integer (or RandomState) seed,
         # so the command-line string has to be converted first.
         split_seed = int(sys.argv[4])
         X = numpy.genfromtxt(file_x, delimiter=' ')
         y = numpy.genfromtxt(file_y, delimiter=' ')
         # Split the data into training/testing sets
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
         # Decision tree regressor, fitted on the training part only so
         # the held-out rows used for prediction are never seen in training.
         clf = DecisionTreeRegressor(max_depth=2)
         clf.fit(X_train, y_train)
         y_predict = clf.predict(X_test)
         numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,30 @@
 import time
 import sys
 import numpy
 import vector
 from sklearn import linear_model
 from sklearn.cross_validation import train_test_split
 # Command-line synopsis; the script needs exactly four arguments.
 usage = "filename features_file labels_file output_file split_seed"
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print(usage)
     else:
         file_x = sys.argv[1]
         file_y = sys.argv[2]
         file_out = sys.argv[3]
         # train_test_split only accepts an integer (or RandomState) seed,
         # so the command-line string has to be converted first.
         split_seed = int(sys.argv[4])
         X = numpy.genfromtxt(file_x, delimiter=' ')
         y = numpy.genfromtxt(file_y, delimiter=' ')
         # Split the data into training/testing sets
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
         # Linear regression object
         regr = linear_model.LinearRegression()
         regr.fit(X_train, y_train)
         y_predict = regr.predict(X_test)
         numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,30 @@
 import time
 import sys
 import numpy
 import vector
 from sklearn import linear_model
 from sklearn.cross_validation import train_test_split
 # Command-line synopsis; the script needs exactly four arguments.
 usage = "filename features_file labels_file output_file split_seed"
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print(usage)
     else:
         file_x = sys.argv[1]
         file_y = sys.argv[2]
         file_out = sys.argv[3]
         # train_test_split only accepts an integer (or RandomState) seed,
         # so the command-line string has to be converted first.
         split_seed = int(sys.argv[4])
         X = numpy.genfromtxt(file_x, delimiter=' ')
         y = numpy.genfromtxt(file_y, delimiter=' ')
         # Split the data into training/testing sets
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
         # Logistic regression
         regr = linear_model.LogisticRegression()
         regr.fit(X_train, y_train)
         y_predict = regr.predict(X_test)
         numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,33 @@
 import time
 import sys
 import numpy
 import vector
 from sklearn.svm import SVR
 from sklearn.cross_validation import train_test_split
 # Command-line synopsis; the script needs exactly four arguments.
 usage = "filename features_file labels_file output_file split_seed"
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print(usage)
     else:
         file_x = sys.argv[1]
         file_y = sys.argv[2]
         file_out = sys.argv[3]
         # train_test_split only accepts an integer (or RandomState) seed,
         # so the command-line string has to be converted first.
         split_seed = int(sys.argv[4])
         X = numpy.genfromtxt(file_x, delimiter=' ')
         y = numpy.genfromtxt(file_y, delimiter=' ')
         # Split the data into training/testing sets
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
         # Support vector regression, fitted on the training part only so
         # the held-out rows used for prediction are never seen in training.
         svr = SVR(kernel='linear', C=1e3)
         svr.fit(X_train, y_train)
         y_predict = svr.predict(X_test)
         numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,121 @@
 # End-to-end pipeline: build the feature files, split the data, then run
 # every regression model on every feature set for the four label files.

 # generate raw files
 echo "generating raw files"
 python cPickleparser.py
 python features_sampled.py

 # generate features
 echo "generating bahsic features"
 python features_bahsic.py "data/features_raw.dat" "data/labels_0.dat" "data/features_bahsic_0.dat" "data/features_normalized.dat"
 python features_bahsic.py "data/features_raw.dat" "data/labels_1.dat" "data/features_bahsic_1.dat"
 python features_bahsic.py "data/features_raw.dat" "data/labels_2.dat" "data/features_bahsic_2.dat"
 python features_bahsic.py "data/features_raw.dat" "data/labels_3.dat" "data/features_bahsic_3.dat"
 echo "generating rrt features"
 echo "Run the matlab file to generate Recht and Rahimi Random Fourier features"
 echo "generating downsampled features"
 python features_sampled.py "data/features_sampled.dat"

 # do a train-test split
 for i in 0 1 2 3; do
     python split_data.py "data/features_raw.dat" "data/labels_$i.dat" "data/labels_test_$i.dat"
 done

 # Seed for the train/test split inside the regression scripts. It has to
 # be passed as its own argument (the scripts require exactly four) and it
 # matches the seed hard-coded in split_data.py so the predictions line up
 # with the data/labels_test_*.dat files written above.
 seed=42

 # The regression scripts write into predict/ but do not create it.
 mkdir -p predict

 # BAHSIC: one feature file per label set
 for model in linear bayesian dtree svr logistic; do
     echo "regression bahsic $model"
     for i in 0 1 2 3; do
         python "regression_$model.py" "data/features_bahsic_$i.dat" "data/labels_$i.dat" "predict/hsic_${model}_$i.dat" "$seed"
     done
 done

 # rrt and sampled: one shared feature file for all label sets
 for feat in rrt sampled; do
     for model in linear bayesian dtree svr logistic; do
         echo "regression $feat $model"
         for i in 0 1 2 3; do
             python "regression_$model.py" "data/features_$feat.dat" "data/labels_$i.dat" "predict/${feat}_${model}_$i.dat" "$seed"
         done
     done
 done
@@ -0,0 +1,17 @@
 #!/usr/bin/env python
 # Copyright (c) 2004 National ICT Australia --- All Rights Reserved
 # THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF SML.NICTA
 # The copyright notice above does not evidence any
 # actual or intended publication of this work.
 #
 # Authors:      Le Song
 # Last changed: 02/08/2006 (Christfried Webers)
 import numpy
 def setdiag0(K):
    """Zero the main diagonal of the square matrix K in place.

    The size is taken from the first dimension of K.  Nothing is returned.
    """
    size = K.shape[0]
    # In the flattened matrix the diagonal entries sit size + 1 apart.
    diagPositions = numpy.arange(0, size * (size + 1), size + 1)
    numpy.put(K, diagPositions, 0.0)
@@ -0,0 +1,24 @@
 import time
 import sys
 import numpy
 import vector
 from sklearn.cross_validation import train_test_split
 # Command-line synopsis; the script needs exactly three arguments.
 usage = "filename features_file labels_file labels_test_output_file"
 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print(usage)
     else:
         file_x = sys.argv[1]
         file_y = sys.argv[2]
         file_y_test = sys.argv[3]
         X = numpy.genfromtxt(file_x, delimiter=' ')
         y = numpy.genfromtxt(file_y, delimiter=' ')
         # Fixed seed so repeated runs produce the same split.
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
         # The test labels go to the requested file; the feature halves go
         # to fixed paths under data/.
         numpy.savetxt(file_y_test, y_test)
         numpy.savetxt("data/features_train.dat", X_train)
         numpy.savetxt("data/features_test.dat", X_test)