Initial commit

2014-11-21 18:52:56 +05:30
commit 3fd37a416f
@@ -0,0 +1,48 @@
+dump/
+
+*.dat
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template 
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+
+# PyBuilder
+target/
@@ -0,0 +1,22 @@
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Rishi Dua <rishirdua@gmail.com>, TV Ashok <veeranjaneyaashok@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
@@ -0,0 +1,28 @@
+EMOTION CLASSIFICATION
+======================
+Feature selection via dependence maximization for EEG-based emotion classification. This project was done as part of the Neural Networks course in the Fall 2014 semester at IIT Delhi.
+
+Supervisor: Dr. Jayadeva
+
+Authors
+-------
+Rishi Dua <http://github.com/rishirdua>
+TV Ashok <http://github.com/tvashok>
+
+Install
+-------
+1. Install Python and scikit
+2. Copy the DEAP dataset (cPickle preprocessed) to data/raw folder
+3. Run script.sh
+
+Documentation
+-------------
+Refer to docs/readme.pdf
+
+Contribute
+----------
+- Source Code: https://github.com/rishirdua/emotion-classification/
+
+License
+-------
+This project is licensed under the terms of the MIT license. See LICENCE.txt for details.
@@ -0,0 +1,190 @@
+# Copyright (c) 2006, National ICT Australia
+# All rights reserved.
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the 'License'); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an 'AS IS' basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# Authors: Le Song (lesong@it.usyd.edu.au)
+# Created: (20/10/2006)
+# Last Updated: (dd/mm/yyyy)
+#
+
+##\package elefant.fselection.bahsic
+# This module performs backward elimination for feature selection
+# using HSIC (BAHSIC).
+#
+# The algorithm proceeds recursively, eliminating the least
+# relevant features and adding them to the eliminated list
+# in each iteration. For the theoretical underpinning see the
+# following reference:
+#
+# Le Song, Justin Bedo, Karsten M. Borgwardt, Arthur Gretton
+# and Alex Smola. The BAHSIC family of gene selection algorithms.
+#
+
+__version__ = '$Revision: $' 
+# $Source$
+
+import numpy
+from scipy import optimize
+
+import vector
+from hsic import CHSIC
+from setdiag0 import setdiag0
+
+
+## Class that performs backward elimination for feature selection (BAHSIC).
+#
+# It has two versions of BAHSIC: one without optimization over the kernel
+# parameters and one with optimization over the kernel parameters.
+#
+class CBAHSIC(object):
+    def __init__(self):
+        pass
+
+    ## BAHSIC with optimization over the kernel parameters.
+    # @param x The data.
+    # @param y The labels.
+    # @param kernelx The kernel on the data.
+    # @param kernely The kernel on the labels.
+    # @param flg3 The number of desired features.
+    # @param flg4 The proportion of features eleminated in each iteration.
+    #
+    def BAHSICOpt(self, x, y, kernelx, kernely, flg3, flg4):
+        assert len(x.shape) == 2, 'Argument 1 has wrong shape'
+        assert len(y.shape) == 2, 'Argument 2 has wrong shape'
+        assert x.shape[0] == y.shape[0], \
+               'Argument 1 and 2 have different number of data points'
+                       
+        print '--initializing...'
+        hsic = CHSIC()
+        
+        L = kernely.Dot(y, y)
+        setdiag0(L)
+        sL = numpy.sum(L, axis=1)
+        ssL = numpy.sum(sL)
+
+        n = x.shape
+        eliminatedI = []
+        selectedI = set(numpy.arange(n[1]))
+
+        kernelx.CreateCacheKernel(x)
+        sga = kernelx._typicalParam
+        sgaN = sga.shape
+        sgaN = sgaN[0]
+
+        while True:        
+            selectedI = selectedI - set(eliminatedI)
+            sI = numpy.array([j for j in selectedI])
+            m = len(sI)
+
+            print m
+            if (m == 1):
+                eliminatedI.append(selectedI.pop())
+                break
+
+            sgaMat = []
+            hsicMat = []
+            for k in range(sgaN):
+                ## bfgs in scipy is not working here
+                retval = optimize.fmin_cg(hsic.ObjUnBiasedHSIC, \
+                                          sga[[k],].ravel(), \
+                                          hsic.GradUnBiasedHSIC,\
+                                          args=[x, kernelx, L, sL, ssL], \
+                                          gtol=1e-6, maxiter=100, \
+                                          full_output=True, disp=False)
+                sgaMat.append(retval[0])
+                hsicMat.append(retval[1])
+                    
+            k = numpy.argmin(hsicMat)
+            sga0 = sgaMat[k]
+            
+            objj = []
+            for j in selectedI:
+                K = kernelx.DecDotCacheKernel(x, x[:,[j]], sga0)
+                setdiag0(K)
+                objj.append(hsic.UnBiasedHSICFast(K, L, sL, ssL))
+
+            if m > flg3:
+                maxj = numpy.argsort(objj)
+                num = int(flg4 * m)+1
+                if m - num <= flg3:
+                    num = m - flg3
+                maxj = maxj[m:m-num-1:-1]
+            else:
+                maxj = numpy.array([numpy.argmax(objj)])
+                
+            j = numpy.take(sI,maxj)
+            eliminatedI.extend(j)
+            kernelx.DecCacheKernel(x, x[:,j])
+
+        kernelx.ClearCacheKernel(x)
+        return eliminatedI
+
+    ## BAHSIC without optimization over the kernel parameters.
+    # @param x The data.
+    # @param y The labels.
+    # @param kernelx The kernel on the data.
+    # @param kernely The kernel on the labels.
+    # @param flg3 The number of desired features.
+    # @param flg4 The proportion of features eleminated in each iteration.
+    #
+    def BAHSICRaw(self, x, y, kernelx, kernely, flg3, flg4):
+        assert len(x.shape) == 2, 'Argument 1 has wrong shape'
+        assert len(y.shape) == 2, 'Argument 2 has wrong shape'
+        assert x.shape[0] == y.shape[0], \
+               'Argument 1 and 2 have different number of data points'       
+
+        print '--initializing...'
+        hsic = CHSIC()
+
+        L = kernely.Dot(y, y)
+        setdiag0(L)
+
+        sL = numpy.sum(L, axis=1)
+        ssL = numpy.sum(sL)
+
+        n = x.shape
+        eliminatedI = []
+        selectedI = set(numpy.arange(n[1]))
+
+        kernelx.CreateCacheKernel(x)
+
+        while True:
+            selectedI = selectedI - set(eliminatedI)
+            sI = numpy.array([j for j in selectedI])
+            m = len(sI)
+
+            print m
+            if (m == 1):
+                eliminatedI.append(selectedI.pop())
+                break
+
+            objj = []
+            for j in selectedI:
+                K = kernelx.DecDotCacheKernel(x, x[:,[j]])
+                setdiag0(K)
+                objj.append(hsic.UnBiasedHSICFast(K, L, sL, ssL))
+
+            if m > flg3:
+                maxj = numpy.argsort(objj)
+                num = int(flg4 * m)+1
+                if m-num <= flg3:
+                    num = m - flg3
+                maxj = maxj[m:m-num-1:-1]
+            else:
+                maxj = numpy.array([numpy.argmax(objj)])
+
+            j = numpy.take(sI,maxj)
+            eliminatedI.extend(j)
+            kernelx.DecCacheKernel(x, x[:,j])
+
+        kernelx.ClearCacheKernel(x)
+        return eliminatedI
@@ -0,0 +1,46 @@
+import cPickle
+import os.path
+from multiprocessing import Pool
+import sys
+
+
+def main():
+	nLabel, nTrial, nUser, nChannel, nTime  = 4, 40, 32, 40, 8064
+	#new_array = [[[None] *w for i in range(h)] for j in range(l)]
+	print "Program started"+"\n"
+	fout_data = open("data/features_raw.dat",'w')
+	fout_labels0 = open("data/labels_0.dat",'w')
+	fout_labels1 = open("data/labels_1.dat",'w')
+	fout_labels2 = open("data/labels_2.dat",'w')
+	fout_labels3 = open("data/labels_3.dat",'w')
+	for i in range(nUser):#4, 40, 32, 40, 8064
+		if(i%8 == 0):
+			if i < 10:
+				name = '%0*d' % (2,i+1)
+			else:
+				name = i+1
+			fname = "data/raw/s"+str(name)+".dat"
+			x = cPickle.load(open(fname, 'rb'))
+			print fname
+			for tr in range(nTrial):
+				if(tr%1 == 0):
+					for dat in range(nTime):
+						if(dat%32 == 0):
+							for ch in range(nChannel):
+								#fout_data.write(str(ch+1) + " ");
+								fout_data.write(str(x['data'][tr][ch][dat]) + " ");
+					fout_labels0.write(str(x['labels'][tr][0]) + "\n");
+					fout_labels1.write(str(x['labels'][tr][1]) + "\n");
+					fout_labels2.write(str(x['labels'][tr][2]) + "\n");
+					fout_labels3.write(str(x['labels'][tr][3]) + "\n");
+					fout_data.write("\n");
+	fout_labels0.close()
+	fout_labels1.close()
+	fout_labels2.close()
+	fout_labels3.close()
+	fout_data.close()
+	
+	print "\n"+"Print Successful"
+
+if __name__ == "__main__":
+	main()
@@ -0,0 +1,56 @@
+import time
+import sys
+import numpy
+import vector
+from bahsic import CBAHSIC
+
+usage = "yolo"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)<4):
+		print usage
+	else:
+		file_x = sys.argv[1];
+		file_y = sys.argv[2];
+		file_out = sys.argv[3];
+		if (sys.argv==5):
+			file_normalized = sys.argv[5]
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+		
+		bahsic = CBAHSIC()
+		data_no = 160
+		features_tokeep = 5040
+		y.shape = (data_no,1)
+
+		# Normalize the labels.
+		y = 1.0*y
+		tmp_no = numpy.sum(y)
+		pno = (data_no + tmp_no) / 2
+		nno = (data_no - tmp_no) / 2
+		y[y>0] = y[y>0]/pno
+		y[y<0] = y[y<0]/nno
+
+		# Normalize the data. 
+		m = X.mean(0)
+		s = X.std(0)
+		X.__isub__(m).__idiv__(s)
+
+		t1 = time.clock()
+		tmp = bahsic.BAHSICRaw(X, y, vector.CLinearKernel(), vector.CLinearKernel(), features_tokeep, 0.1)
+		t2 = time.clock()
+		print "time taken: "+str(t2-t1)
+		print '--rank of the features'
+		print '--better features towards the end of the list:'
+		print tmp
+
+		hsicfeatures= numpy.zeros(shape=(data_no,features_tokeep))
+		for i in range(0,data_no):
+			for j in range(0,features_tokeep):
+				hsicfeatures[i][j] = X[i][tmp[features_tokeep+j]]
+
+		numpy.savetxt(file_out, hsicfeatures)
+		if (sys.argv==5):
+			numpy.savetxt('original.csv', X)
@@ -0,0 +1,39 @@
+
+
+%% Random Fourier feature transform of the raw feature matrix
+
+clc;
+clear all;
+
+% Transforms features into Rahimi and Recht's Random Fourier Features as defined in:
+% Rahimi, Ali, and Benjamin Recht. "Random features for large-scale kernel machines." In Advances in neural information processing systems, pp. 1177-1184. 2007.
+%
+% Input:  data/features_raw.dat  (space-delimited, one data point per row)
+% Output: data/features_rrt.dat  (space-delimited, n_randomfeatures columns)
+
+
+X = dlmread('data/features_raw.dat', ' ');
+
+n_features = size(X,2);
+n_data = size(X,1);
+
+% sigma is the standard deviation of the random projection weights.
+% NOTE(review): sqrt(2/gamma_inv) = sqrt(2*gamma) presumably targets the
+% kernel exp(-gamma*||x-y||^2) - confirm. gamma itself is not used below.
+gamma_inv = 0.1;
+gamma = 1/gamma_inv;
+sigma = sqrt(2/gamma_inv);
+n_randomfeatures = 5040;
+% Draw Gaussian projections W and uniform phases b in [0, 2*pi), replicate b
+% to one row per data point, then map through the scaled cosine.
+W=normrnd(0,sigma,n_features,n_randomfeatures);
+b=2*pi*rand(1,n_randomfeatures);
+B = ones(n_data,1)*(b);
+X_rrt = sqrt(2/n_randomfeatures)*cos(X*W+B);
+disp('calculated');
+
+% Column-wise normalization, currently disabled (it refers to a variable
+% Data_new that this script does not define):
+%mean_tr = mean(Data_new);
+%std_tr = std(Data_new);
+%Data_new = (Data_new-repmat(mean_tr,n_data,1))./(repmat(std_tr,n_data,1));
+%disp('normalized');
+%toc;
+
+% Write the transformed features as a space-delimited text file.
+
+dlmwrite('data/features_rrt.dat',X_rrt, ' ');
+
+disp('writen to file');
@@ -0,0 +1,36 @@
+import cPickle
+import os.path
+from multiprocessing import Pool
+import sys
+
+usage = "filename out_file"
+
+def generate_features(fout_file):
+	nLabel, nTrial, nUser, nChannel, nTime  = 4, 40, 32, 40, 8064
+	#new_array = [[[None] *w for i in range(h)] for j in range(l)]
+	print "Program started"+"\n"
+	fout_data = open(fout_file,'w')
+	for i in range(nUser):#4, 40, 32, 40, 8064
+		if(i%8 == 0):
+			if i < 10:
+				name = '%0*d' % (2,i+1)
+			else:
+				name = i+1
+			fname = "data/raw/s"+str(name)+".dat"
+			x = cPickle.load(open(fname, 'rb'))
+			print fname
+			for tr in range(nTrial):
+				if(tr%1 == 0):
+					for dat in range(nTime):
+						if(dat%64 == 0):
+							for ch in range(nChannel):
+								#fout_data.write(str(ch+1) + " ");
+								fout_data.write(str(x['data'][tr][ch][dat]) + " ");
+					fout_data.write("\n");
+	fout_data.close()
+
+if __name__ == "__main__":
+	if (len(sys.argv)!=2):
+		print usage
+	else:
+		generate_features(sys.argv[1])
@@ -0,0 +1,168 @@
+# Copyright (c) 2006, National ICT Australia
+# All rights reserved.
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# Authors: Le Song (lesong@it.usyd.edu.au) and Alex Smola
+# (alex.smola@nicta.com.au)
+# Created: (20/10/2006)
+# Last Updated: (dd/mm/yyyy)
+#
+
+##\package elefant.kernels.generic
+# This module contains the generic class for kernels.
+#
+# The CKernel class provides a common interface for all kernel classes. Note
+# that it should never be instantiated.
+#
+
+__version__ = "$Revision: $" 
+# $Source$ 
+
+import numpy
+import numpy.random as random
+
+## Generic kernel class
+#
+# This kernel provides a common interface for all kernels. This interface
+# includes the following key kernel manipulations (functions):
+# --Dot(x1, x2): $K(x1_i, x2_j)$
+# --Expand(x1, x2, alpha2): $\sum_r K(x1_i,x2_r) \times alpha2_r$
+# --Tensor(x1, y1, x2, y2): $K(x1_i,x2_j) \times (y1_i \times y2_j)$
+# --TensorExpand(x1, y1, x2, y2, alpha2):
+# $\sum_r K(x1_i,x2_r) \times (y1_i \times y2_r) \times alpha2_r$
+# --Remember(x): Remember data x
+# --Forget(x): Remove remembered data x
+# To design a specific kernel, simply override these methods. The generic
+# kernel itself should never be instantiated.
+#
+class CKernel(object):
+    def __init__(self, blocksize=128):
+        ## @var _blocksize
+        # Parameter that determines the size of each block when computing the
+        # kernel matrix in blocks. Properly blocking the kernel matrix during
+        # computation improves the speed.
+        #
+        self._blocksize = blocksize
+        ## @var _name
+        # Name of the kernel.
+        #
+        self._name = "Generic kernel"
+        ## @var _cacheData
+        # Cache that stores data that have appeared before.
+        #
+        self._cacheData = {}
+        
+    def __str__(self):
+        return self._name
+    
+    def __repr__(self):
+        return "Kernel object of type '" + self._name + "'"
+
+    ## Compute the kernel between two data points x1 and x2.
+    # It returns a scale value of dot product between x1 and x2.
+    # @param x1 [read] The first data point.
+    # @param x2 [read] The second data point.
+    #
+    def K(self, x1, x2):
+        raise NotImplementedError, \
+              'CKernel.K in abstract class is not implemented'
+    
+    ## Compute the kernel between the data points in x1 and those in x2.
+    # It returns a matrix with entry $(ij)$ equal to $K(x1_i, x1_j)$.
+    # If index1/index2 is
+    # specified, only those data points in x1/x2 with indices corresponding
+    # to index1/index2 are used to compute the kernel matrix. Furthermore,
+    # if output is specified, the provided buffer is used explicitly to
+    # store the kernel matrix.
+    # @param x1 [read] The first set of data points.
+    # @param x2 [read] The second set of data points.
+    # @param index1 [read] The indices into the first set of data points. 
+    # @param index2 [read] The indices into the second set of data points.
+    # @param output [write] The buffer where the output matrix is written into.
+    #
+    def Dot(self, x1, x2, index1=None, index2=None, output=None):
+        raise NotImplementedError, \
+              'CKernel.Dot in abstract class is not implemented' 
+
+    ## Compute the kernel between the data points in x1 and those in x2,
+    # then multiply the resulting kernel matrix by alpha2.
+    # It returns a matrix with entry $(ij)$ equal to
+    # $sum_r K(x1_i,x2_r) \times alpha2_r$.
+    # Other parameters are defined similarly as those in Dot. 
+    # @param x1 [read] The first set of data points.
+    # @param x2 [read] The second set of data points.
+    # @param alpha2 [read] The set of coefficients.
+    # @param index1 [read] The indices into the first set of data points. 
+    # @param index2 [read] The indices into the second set of data points.
+    # @param output [write] The buffer where the output matrix is written into.
+    #
+    def Expand(self, x1, x2, alpha2, index1=None, index2=None, output=None):
+        raise NotImplementedError, \
+              'CKernel.Expand in abstract class is not implemented' 
+
+    ## Compute the kernel between the data points in x1 and those in x2,
+    # then multiply the resulting kernel matrix elementwiesely by the
+    # the outer-product matrix between y1 and y2. It returns a matrix
+    # with entry $(ij)$ equal to $K(x1_i,x2_j) \times (y1_i \times y1_j)$.
+    # Other parameters are defined similarly as those in Dot. 
+    # @param x1 [read] The first set of data points.
+    # @param y1 [read] The first set of labels.
+    # @param x2 [read] The second set of data points.
+    # @param y2 [read] The second set of labels.
+    # @param index1 [read] The indices into the first set of data points. 
+    # @param index2 [read] The indices into the second set of data points.
+    # @param output [write] The buffer where the output matrix is written into.
+    #
+    def Tensor(self, x1, y1, x2, y2, index1=None, index2=None, output=None):
+        raise NotImplementedError, \
+              'CKernel.Tensor in abstract class is not implemented' 
+
+    ## Compute the kernel between the data points in x1 and those in x2,
+    # then multiply the resulting kernel matrix elementwiesely by the
+    # the outer-product matrix between y1 and y2, and final multiply
+    # the resulting matrix by alpha2. It returns a matrix with entry $(ij)$
+    # equal to $sum_r K(x1_i,x2_r) \times (y1_i \times y1_r) \times alpha2_r$.
+    # Other parameters are defined similarly as those in Dot. 
+    # @param x1 [read] The first set of data points.
+    # @param y1 [read] The first set of labels.
+    # @param x2 [read] The second set of data points.
+    # @param y2 [read] The second set of labels.
+    # @param index1 [read] The indices into the first set of data points. 
+    # @param index2 [read] The indices into the second set of data points.
+    # @param output [write] The buffer where the output matrix is written into.
+    #
+    def TensorExpand(self, x1, y1, x2, y2, alpha2, index1=None, index2=None, \
+                     output=None):
+        raise NotImplementedError, \
+              'CKernel.TensorExpand in abstract class is not implemented'
+    
+    ## Remember the data by performing necessary preprossing on
+    # the data, storing it in the cache and indexing it by the id of
+    # the data. The preprocessing can be defined differently for
+    # different classes. If the data have already been remembered,
+    # the old stored information is simply overwritten.
+    # @param x [read] The data to be remembered.
+    #
+    def Remember(self, x):
+        raise NotImplementedError, \
+              'CKernel.Remember in abstract class is not implemented' 
+
+    ## Remove a remembered data from the cache. If x is not given, then
+    # all the data remembered in the cache  will be removed. If a given
+    # x is not remembered beforehand, False is returned; otherwise, True
+    # is returned. 
+    # @param x [read] The data to be removed.
+    #
+    def Forget(self, x=None):
+        raise NotImplementedError, \
+              'CKernel.Forget in abstract class is not implemented' 
+   
@@ -0,0 +1,267 @@
+# Copyright (c) 2006, National ICT Australia
+# All rights reserved.
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the 'License'); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an 'AS IS' basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# Authors: Le Song (lesong@it.usyd.edu.au)
+# Created: (20/10/2006)
+# Last Updated: (dd/mm/yyyy)
+#
+
+##\package elefant.fselection.hsic
+# This module performs computations related to the Hilbert-Schmidt
+# Independence Criterion (HSIC).
+#
+# The biased estimator of HSIC is computed as
+# $HSIC=\frac{1}{(m-1)^2}Tr(KHLH)$, where $K$ and $L$
+# are the kernel matrices for the data and the labels respectively, and
+# $H=I-\frac{1}{m}11^\top$, where $m$ is the number of data points,
+# is the centering matrix. The unbiased estimator of HSIC is computed as
+# $HSIC=\frac{1}{m(m-3)}\left[Tr(KL)+\frac{1}{(m-1)(m-2)}1^\top K11^\top L1
+# -\frac{2}{m-2}1^\top KL1\right]$, where the diagonals of $K$ and $L$
+# are set to zero. For more theoretical underpinning
+# of HSIC, see the following reference:
+# Gretton, A., O. Bousquet, A. Smola and B. Schoelkopf: Measuring
+# Statistical Dependence with Hilbert-Schmidt Norms. Algorithmic
+# Learning Theory: 16th International Conference, ALT 2005, 63-78, 2005.
+# 
+
+__version__ = '$Revision: $' 
+# $Source$
+
+import numpy
+import vector
+from setdiag0 import setdiag0
+
+## Class that performs computations related to HSIC.
+#
+# It contains functions that compute the biased and unbiased HSIC, fast
+# variants that reuse already computed parts of HSIC, and functions that
+# enable an optimization of HSIC with respect to the kernel parameters.
+#
+class CHSIC(object):
+    def __init__(self):
+        pass
+
+    ## Compute HLH give the labels.
+    # @param y The labels.
+    # @param kernely The kernel on the labels, default to linear kernel.
+    #
+    def ComputeHLH(self, y, kernely=vector.CLinearKernel()):
+        ny = y.shape
+        if len(ny) > 1:
+            lMat = kernely.Dot(y, y)
+        else:
+            lMat = numpy.outerproduct(y, y)
+
+        sL = numpy.sum(lMat, axis=1)
+        ssL = numpy.sum(sL)
+        # hlhMat
+        return lMat - numpy.add.outer(sL, sL)/ny[0] + ssL/(ny[0]*ny[0])
+
+    ## Compute the biased estimator of HSIC.
+    # @param x The data.
+    # @param y The labels.
+    # @param kernelx The kernel on the data, default to linear kernel.
+    # @param kernely The kernel on the labels, default to linear kernel.
+    #
+    def BiasedHSIC(self, x, y, kernelx=vector.CLinearKernel(), \
+                   kernely=vector.CLinearKernel()):
+        nx = x.shape
+        ny = y.shape
+        assert nx[0] == ny[0], \
+               "Argument 1 and 2 have different number of data points"
+
+        if len(nx) > 1:
+            kMat = kernelx.Dot(x, x)
+        else:
+            kMat = numpy.outerproduct(x, x)
+
+        hlhMat = ComputeHLH(y, kernely)
+        return numpy.sum(numpy.sum(kMat * hlhMat)) / ((nx[0]-1)*(nx[0]-1))
+
+    ## Objective of the biased HSIC when performing optimization over
+    # the kernel parameters.
+    # @param param The kernel parameters.
+    # @param x The data.
+    # @param kernelx The kernel on the data.
+    # @param hlhMat The HLH matrix on the labels.
+    #
+    def ObjBiasedHSIC(self, param, x, kernelx, hlhMat):
+        nx = x.shape
+        kMat = kernelx.DotCacheKernel(x, param)
+        return -numpy.sum(numpy.sum(kMat * hlhMat)) / ((nx[0]-1)*(nx[0]-1))
+
+    ## Gradient of the objective of the biased HSIC when performing
+    # optimization over the kernel parameters.
+    # @param param The kernel parameters.
+    # @param x The data.
+    # @param kernelx The kernel on the data.
+    # @param hlhMat The HLH matrix on the labels.
+    #
+    def GradBiasedHISC(self, param, x, kernelx, hlhMat):
+        nx = x.shape
+        kMat = kernelx.GradDotCacheKernel(x, param)
+        return -numpy.sum(numpy.sum(kMat * hlhMat)) / ((nx[0]-1)*(nx[0]-1))
+
+    ## Fast computation of the biased HSIC when the kernel matrix
+    # for the data and the HLH matrix for the labels are already
+    # computed.
+    # @param kMat The kernel matrix for the data.
+    # @param hlhMat The HLH matrix for the labels.
+    #
+    def BiasedHSICFast(self, kMat, hlhMat):
+        nx = kMat.shape
+        assert (kMat.shape == hlhMat.shape), \
+               "Argument 1 and 2 have different shapes"
+
+        return (kMat * hlhMat).sum() / ((nx[0]-1)*(nx[0]-1))
+
+    ## Fast computation of the biased HSIC when the kernel matrix
+    # for the labels can be decomposed into HLH = y * y' and the
+    # rank of y is low
+    # @param kMat The kernel matrix for the data.
+    # @param y The HLH = y * y' for the labels.
+    #
+    def BiasedHSICFast2(self, kMat, y):
+        nx = kMat.shape
+        assert (kMat.shape[0] == y.shape[0]), \
+               "Argument 1 and 2 have different shapes"
+
+        return numpy.dot(y.T, numpy.dot(kMat, y)).trace() / ((nx[0]-1)*(nx[0]-1))
+
+    ## Fast computation of the biased HSIC when the kernel matrix
+    # for the data K can be decomposed into K = x * x' and that 
+    # for the labels can be decomposed into HLH = y * y' and the
+    # rank of y is low (this will be useful after incomplete cholesky
+    # factorization
+    # @param x The K = x * x' for the data.
+    # @param y The HLH = y * y' for the labels.
+    #
+    def BiasedHSICFast3(self, x, y):
+        nx = x.shape
+        assert (x.shape[0] == y.shape[0]), \
+               "Argument 1 and 2 have different shapes"
+
+        return (numpy.dot(x.T, y)**2).sum() / ((nx[0]-1)*(nx[0]-1))   
+    
+
+    ## Compute the UNbiased estimator of HSIC.
+    # @param x The data.
+    # @param y The labels.
+    # @param kernelx The kernel on the data, default to linear kernel.
+    # @param kernely The kernel on the labels, default to linear kernel.
+    #
+    def UnBiasedHSIC(self, x, y, kernelx=vector.CLinearKernel(), \
+                     kernely=vector.CLinearKernel()):
+        nx = x.shape
+        ny = y.shape
+        assert nx[0] == ny[0], \
+               "Argument 1 and 2 have different number of data points"
+
+        kMat = kernelx.Dot(x,x)
+        setdiag0(kMat)
+
+        lMat = kernely.Dot(y,y)
+        setdiag0(lMat)
+
+        sK = kMat.sum(axis=1)
+        ssK = sK.sum()
+        sL = lMat.sum(axis=1)
+        ssL = sL.sum()
+
+        return ( kMat.__imul__(lMat).sum() + \
+                 (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) - \
+                 2 * sK.__imul__(sL).sum() / (nx[0]-2) \
+                 ) / (nx[0]*(nx[0]-3))
+
+    ## Objective of the UNbiased HSIC when performing optimization over
+    # the kernel parameters.
+    # @param param The kernel parameters.
+    # @param x The data.
+    # @param kernelx The kernel on the data.
+    # @param lMat The kernel matrix of the label.
+    # @param sL The vector of the sum of each row of lMat.
+    # @param ssL The vector of the sum of all entries in lMat.
+    #
+    def ObjUnBiasedHSIC(self, param, x, kernelx, lMat, sL, ssL):
+        nx = x.shape    
+        kMat = kernelx.DotCacheKernel(x, param)
+        sK = numpy.sum(kMat, axis=1)
+        ssK = numpy.sum(sK)
+
+        return -( numpy.sum(numpy.sum(kMat*lMat)) \
+                  + (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) \
+                  - 2*numpy.sum(sK*sL)/(nx[0]-2) \
+                  ) / (nx[0]*(nx[0]-3))
+
+
+    ## Gradient of the objective of the UNbiased HSIC when performing
+    # optimization over the kernel parameters.
+    # @param param The kernel parameters.
+    # @param x The data.
+    # @param kernelx The kernel on the data.
+    # @param lMat The kernel matrix of the label.
+    # @param sL The vector of the sum of each row of lMat.
+    # @param ssL The vector of the sum of all entries in lMat.
+    #
+    def GradUnBiasedHSIC(self, param, x, kernelx, lMat, sL, ssL):
+        nx = x.shape
+        kMat = kernelx.GradDotCacheKernel(x, param)
+        sK = numpy.sum(kMat, axis=1)
+        ssK = numpy.sum(sK)
+
+        return -( numpy.sum(numpy.sum(kMat*lMat)) \
+                  + (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) \
+                  - 2*numpy.sum(sK*sL)/(nx[0]-2) \
+                  ) / (nx[0]*(nx[0]-3))
+
+    ## Fast computation of the biased HSIC when the kernel matrix
+    # for the data and the HLH matrix for the labels are already
+    # computed.
+    # @param kMat The kernel matrix for the data.
+    # @param lMat The kernel matrix of the label.
+    # @param sL The vector of the sum of each row of lMat.
+    # @param ssL The vector of the sum of all entries in lMat.
+    #
+    def UnBiasedHSICFast(self, kMat, lMat, sL, ssL):
+        nx = kMat.shape
+        assert (kMat.shape == lMat.shape), \
+               "Argument 1 and 2 have different shapes"
+
+        sK = numpy.sum(kMat, axis=1)
+        ssK = numpy.sum(sK)
+
+        return ( numpy.sum(numpy.sum(kMat*lMat)) \
+                 + (ssK*ssL)/((nx[0]-1)*(nx[0]-2)) \
+                 - 2*numpy.sum(sK*sL)/(nx[0]-2) \
+                 ) / (nx[0]*(nx[0]-3))
+
+## Normalize each dimension of the data separately to zero mean and unit
+# standard deviation.
+# @param data [read\write] The data to be normalized. Each row is a
+# datum and each column a dimension.
+#
+def normalize(data):
+    m = data.mean(axis=0)
+    s = data.std(axis=0)
+    data.__isub__(m).__itruediv__(s)
+
+## Center the kernel matrix in the feature space.
+# @param k [read\write] The kernel matrix to be centered. 
+#
+def center(k):
+    n = k.shape
+    assert n[0] == n[1], 'k must be symmetric and positive semidefinite'    
+    mk = k.mean(axis=1)
+    mk.shape = (n[0], 1)
+    mmk = mk.mean()
+    k.__isub__(mk).__isub__(mk.T).__iadd__(mmk)
+
@@ -0,0 +1,30 @@
+regression hsic linear
+regression_linear.py data/features_hsic_0.dat data/labels_0.dat data/predict_hsic_0.dat 0.0462957047621
+regression_linear.py data/features_hsic_1.dat data/labels_1.dat data/predict_hsic_1.dat 0.0619267723098
+regression_linear.py data/features_hsic_2.dat data/labels_2.dat data/predict_hsic_2.dat 0.119146367285
+regression_linear.py data/features_hsic_3.dat data/labels_3.dat data/predict_hsic_3.dat 0.108100811067
+regression hsic bayesian
+regression_bayesian.py data/features_hsic_0.dat data/labels_0.dat data/predict_hsic_0.dat 0.0574420849846
+regression_bayesian.py data/features_hsic_1.dat data/labels_1.dat data/predict_hsic_1.dat 0.0744427035158
+regression_bayesian.py data/features_hsic_2.dat data/labels_2.dat data/predict_hsic_2.dat 0.131973261761
+regression_bayesian.py data/features_hsic_3.dat data/labels_3.dat data/predict_hsic_3.dat 0.1292239022
+regression hsic dtree
+regression_dtree.py data/features_hsic_0.dat data/labels_0.dat data/predict_hsic_0.dat 0.0616359499887
+regression_dtree.py data/features_hsic_1.dat data/labels_1.dat data/predict_hsic_1.dat 0.0870560437385
+regression_dtree.py data/features_hsic_2.dat data/labels_2.dat data/predict_hsic_2.dat 0.225990113056
+regression_dtree.py data/features_hsic_3.dat data/labels_3.dat data/predict_hsic_3.dat 0.122724406103
+regression pca linear
+regression_linear.py data/features_pca.dat data/labels_0.dat data/predict_pca_0.dat 0.0525417950272
+regression_linear.py data/features_pca.dat data/labels_1.dat data/predict_pca_1.dat 0.0630462800532
+regression_linear.py data/features_pca.dat data/labels_2.dat data/predict_pca_2.dat 0.11837933335
+regression_linear.py data/features_pca.dat data/labels_3.dat data/predict_pca_3.dat 0.105251711141
+regression pca bayesian
+regression_bayesian.py data/features_pca.dat data/labels_0.dat data/predict_pca_0.dat 0.0598334909344
+regression_bayesian.py data/features_pca.dat data/labels_1.dat data/predict_pca_1.dat 0.0741352839217
+regression_bayesian.py data/features_pca.dat data/labels_2.dat data/predict_pca_2.dat 0.122554051214
+regression_bayesian.py data/features_pca.dat data/labels_3.dat data/predict_pca_3.dat 0.122681510711
+regression pca dtree
+regression_dtree.py data/features_pca.dat data/labels_0.dat data/predict_pca_0.dat 0.0547733949265
+regression_dtree.py data/features_pca.dat data/labels_1.dat data/predict_pca_1.dat 0.0922293538752
+regression_dtree.py data/features_pca.dat data/labels_2.dat data/predict_pca_2.dat 0.19924179118
+regression_dtree.py data/features_pca.dat data/labels_3.dat data/predict_pca_3.dat 0.0951762860752
@@ -0,0 +1,38 @@
+# Copyright (c) 2007, National ICT Australia
+# All rights reserved.
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the 'License'); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an 'AS IS' basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# Authors: Christfried Webers
+# Created: (09/10/2007)
+# Last Updated: 
+#
+
+## Exception classes for the Elefant project
+
+class CElefantException(Exception):
+    """Base class for exceptions in Elefant."""
+    pass
+
+
+class CElefantConstraintException(CElefantException):
+    """Exception raised for constraint violation.
+    
+       Attributes:
+            value   -- input value violating constrained
+            message -- explanation of the error
+    """
+    
+    def __init__(self, value, message):
+        self.value = value
+        self.message = message
+        
+    
@@ -0,0 +1,30 @@
+import time
+import sys
+import numpy
+import vector
+from sklearn.linear_model import BayesianRidge, LinearRegression
+from sklearn.cross_validation import train_test_split
+
+usage = "filename features_file labels_file output_file"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)!=5):
+		print usage
+	else:
+		file_x = sys.argv[1]
+		file_y = sys.argv[2]
+		file_out = sys.argv[3]
+		split_seed = sys.argv[4]
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+
+		# Split the data into training/testing sets
+		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
+		
+		# Bayesian Ridge Regression
+		clf = BayesianRidge(compute_score=True)
+		clf.fit(X, y)
+		y_predict=clf.predict(X_test)
+		numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,30 @@
+import time
+import sys
+import numpy
+import vector
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.cross_validation import train_test_split
+
+usage = "filename features_file labels_file output_file"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)!=5):
+		print usage
+	else:
+		file_x = sys.argv[1]
+		file_y = sys.argv[2]
+		file_out = sys.argv[3]
+		split_seed = sys.argv[4]
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+
+		# Split the data into training/testing sets
+		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
+
+		# Decision tree regressor
+		clf = DecisionTreeRegressor(max_depth=2)
+		clf.fit(X, y)
+		y_predict = clf.predict(X_test)
+		numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,30 @@
+import time
+import sys
+import numpy
+import vector
+from sklearn import linear_model
+from sklearn.cross_validation import train_test_split
+
+usage = "filename features_file labels_file output_file split_seed"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)!=5):
+		print usage
+	else:
+		file_x = sys.argv[1]
+		file_y = sys.argv[2]
+		file_out = sys.argv[3]
+		split_seed = sys.argv[4]
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+		
+		# Split the data into training/testing sets
+		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
+
+		# Linear regression object
+		regr = linear_model.LinearRegression()
+		regr.fit(X_train, y_train)
+		y_predict = regr.predict(X_test)
+		numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,30 @@
+import time
+import sys
+import numpy
+import vector
+from sklearn import linear_model
+from sklearn.cross_validation import train_test_split
+
+usage = "filename features_file labels_file output_file"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)!=5):
+		print usage
+	else:
+		file_x = sys.argv[1]
+		file_y = sys.argv[2]
+		file_out = sys.argv[3]
+		split_seed = sys.argv[4]
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+
+		# Split the data into training/testing sets
+		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
+		
+		# Logistic regression
+		regr = linear_model.LogisticRegression()
+		regr.fit(X_train, y_train)
+		y_predict = regr.predict(X_test)
+		numpy.savetxt(file_out, y_predict)
@@ -0,0 +1,33 @@
+import time
+import sys
+import numpy
+import vector
+from sklearn.svm import SVR
+from sklearn.cross_validation import train_test_split
+
+usage = "filename features_file labels_file output_file"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)!=5):
+		print usage
+	else:
+		file_x = sys.argv[1]
+		file_y = sys.argv[2]
+		file_out = sys.argv[3]
+		split_seed = sys.argv[4]
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+
+		# Split the data into training/testing sets
+		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed)
+
+		#support vector regression
+		svr = SVR(kernel='linear', C=1e3)
+		svr.fit(X, y)
+		y_predict=svr.predict(X_test)
+		numpy.savetxt(file_out, y_predict)
+		
+
+		
@@ -0,0 +1,121 @@
+#generate raw files
+echo "generating raw files"
+python cPickleparser.py
+
+python features_sampled.py
+
+#genertate features
+
+echo "generating bahsic features"
+python features_bahsic.py "data/features_raw.dat" "data/labels_0.dat" "data/features_bahsic_0.dat" "data/features_normalized.dat"
+python features_bahsic.py "data/features_raw.dat" "data/labels_1.dat" "data/features_bahsic_1.dat"
+python features_bahsic.py "data/features_raw.dat" "data/labels_2.dat" "data/features_bahsic_2.dat"
+python features_bahsic.py "data/features_raw.dat" "data/labels_3.dat" "data/features_bahsic_3.dat"
+
+echo "generating rrt features"
+echo "Run the matlab file to generate Recht and Rahimi Random Fourier features"
+
+echo "generating downsampled features"
+python features_sampled.py "data/features_sampled.dat"
+
+# do a train-test split
+python split_data.py "data/features_raw.dat" "data/labels_0.dat" "data/labels_test_0.dat"
+python split_data.py "data/features_raw.dat" "data/labels_1.dat" "data/labels_test_1.dat"
+python split_data.py "data/features_raw.dat" "data/labels_2.dat" "data/labels_test_2.dat"
+python split_data.py "data/features_raw.dat" "data/labels_3.dat" "data/labels_test_3.dat"
+
+#BAHSIC
+echo "regression bahsic linear"
+python regression_linear.py "data/features_bahsic_0.dat" "data/labels_0.dat" "predict/hsic_linear_0.dat 42"
+python regression_linear.py "data/features_bahsic_1.dat" "data/labels_1.dat" "predict/hsic_linear_1.dat 42"
+python regression_linear.py "data/features_bahsic_2.dat" "data/labels_2.dat" "predict/hsic_linear_2.dat 42"
+python regression_linear.py "data/features_bahsic_3.dat" "data/labels_3.dat" "predict/hsic_linear_3.dat 42"
+
+echo "regression bahsic bayesian"
+python regression_bayesian.py "data/features_bahsic_0.dat" "data/labels_0.dat" "predict/hsic_bayesian_0.dat 42"
+python regression_bayesian.py "data/features_bahsic_1.dat" "data/labels_1.dat" "predict/hsic_bayesian_1.dat 42"
+python regression_bayesian.py "data/features_bahsic_2.dat" "data/labels_2.dat" "predict/hsic_bayesian_2.dat 42"
+python regression_bayesian.py "data/features_bahsic_3.dat" "data/labels_3.dat" "predict/hsic_bayesian_3.dat 42"
+
+echo "regression bahsic dtree"
+python regression_dtree.py "data/features_bahsic_0.dat" "data/labels_0.dat" "predict/hsic_dtree_0.dat 42"
+python regression_dtree.py "data/features_bahsic_1.dat" "data/labels_1.dat" "predict/hsic_dtree_1.dat 42"
+python regression_dtree.py "data/features_bahsic_2.dat" "data/labels_2.dat" "predict/hsic_dtree_2.dat 42"
+python regression_dtree.py "data/features_bahsic_3.dat" "data/labels_3.dat" "predict/hsic_dtree_3.dat 42"
+
+echo "regression bahsic svr"
+python regression_svr.py "data/features_bahsic_0.dat" "data/labels_0.dat" "predict/hsic_svr_0.dat 42"
+python regression_svr.py "data/features_bahsic_1.dat" "data/labels_1.dat" "predict/hsic_svr_1.dat 42"
+python regression_svr.py "data/features_bahsic_2.dat" "data/labels_2.dat" "predict/hsic_svr_2.dat 42"
+python regression_svr.py "data/features_bahsic_3.dat" "data/labels_3.dat" "predict/hsic_svr_3.dat 42"
+
+echo "regression bahsic logistic"
+python regression_logistic.py "data/features_bahsic_0.dat" "data/labels_0.dat" "predict/hsic_logistic_0.dat 42"
+python regression_logistic.py "data/features_bahsic_1.dat" "data/labels_1.dat" "predict/hsic_logistic_1.dat 42"
+python regression_logistic.py "data/features_bahsic_2.dat" "data/labels_2.dat" "predict/hsic_logistic_2.dat 42"
+python regression_logistic.py "data/features_bahsic_3.dat" "data/labels_3.dat" "predict/hsic_logistic_3.dat 42"
+
+#rrt
+
+echo "regression rrt linear"
+python regression_linear.py "data/features_rrt.dat" "data/labels_0.dat" "predict/rrt_linear_0.dat"
+python regression_linear.py "data/features_rrt.dat" "data/labels_1.dat" "predict/rrt_linear_1.dat"
+python regression_linear.py "data/features_rrt.dat" "data/labels_2.dat" "predict/rrt_linear_2.dat"
+python regression_linear.py "data/features_rrt.dat" "data/labels_3.dat" "predict/rrt_linear_3.dat"
+
+echo "regression rrt bayesian"
+python regression_bayesian.py "data/features_rrt.dat" "data/labels_0.dat" "predict/rrt_bayesian_0.dat"
+python regression_bayesian.py "data/features_rrt.dat" "data/labels_1.dat" "predict/rrt_bayesian_1.dat"
+python regression_bayesian.py "data/features_rrt.dat" "data/labels_2.dat" "predict/rrt_bayesian_2.dat"
+python regression_bayesian.py "data/features_rrt.dat" "data/labels_3.dat" "predict/rrt_bayesian_3.dat"
+
+echo "regression rrt dtree"
+python regression_dtree.py "data/features_rrt.dat" "data/labels_0.dat" "predict/rrt_dtree_0.dat"
+python regression_dtree.py "data/features_rrt.dat" "data/labels_1.dat" "predict/rrt_dtree_1.dat"
+python regression_dtree.py "data/features_rrt.dat" "data/labels_2.dat" "predict/rrt_dtree_2.dat"
+python regression_dtree.py "data/features_rrt.dat" "data/labels_3.dat" "predict/rrt_dtree_3.dat"
+
+echo "regression rrt svr"
+python regression_svr.py "data/features_rrt.dat" "data/labels_0.dat" "predict/rrt_svr_0.dat"
+python regression_svr.py "data/features_rrt.dat" "data/labels_1.dat" "predict/rrt_svr_1.dat"
+python regression_svr.py "data/features_rrt.dat" "data/labels_2.dat" "predict/rrt_svr_2.dat"
+python regression_svr.py "data/features_rrt.dat" "data/labels_3.dat" "predict/rrt_svr_3.dat"
+
+echo "regression rrt logistic"
+python regression_logistic.py "data/features_rrt.dat" "data/labels_0.dat" "predict/rrt_logistic_0.dat"
+python regression_logistic.py "data/features_rrt.dat" "data/labels_1.dat" "predict/rrt_logistic_1.dat"
+python regression_logistic.py "data/features_rrt.dat" "data/labels_2.dat" "predict/rrt_logistic_2.dat"
+python regression_logistic.py "data/features_rrt.dat" "data/labels_3.dat" "predict/rrt_logistic_3.dat"
+
+#sampled 
+
+echo "regression sampled linear"
+python regression_linear.py "data/features_sampled.dat" "data/labels_0.dat" "predict/sampled_linear_0.dat"
+python regression_linear.py "data/features_sampled.dat" "data/labels_1.dat" "predict/sampled_linear_1.dat"
+python regression_linear.py "data/features_sampled.dat" "data/labels_2.dat" "predict/sampled_linear_2.dat"
+python regression_linear.py "data/features_sampled.dat" "data/labels_3.dat" "predict/sampled_linear_3.dat"
+
+echo "regression sampled bayesian"
+python regression_bayesian.py "data/features_sampled.dat" "data/labels_0.dat" "predict/sampled_bayesian_0.dat"
+python regression_bayesian.py "data/features_sampled.dat" "data/labels_1.dat" "predict/sampled_bayesian_1.dat"
+python regression_bayesian.py "data/features_sampled.dat" "data/labels_2.dat" "predict/sampled_bayesian_2.dat"
+python regression_bayesian.py "data/features_sampled.dat" "data/labels_3.dat" "predict/sampled_bayesian_3.dat"
+
+echo "regression sampled dtree"
+python regression_dtree.py "data/features_sampled.dat" "data/labels_0.dat" "predict/sampled_dtree_0.dat"
+python regression_dtree.py "data/features_sampled.dat" "data/labels_1.dat" "predict/sampled_dtree_1.dat"
+python regression_dtree.py "data/features_sampled.dat" "data/labels_2.dat" "predict/sampled_dtree_2.dat"
+python regression_dtree.py "data/features_sampled.dat" "data/labels_3.dat" "predict/sampled_dtree_3.dat"
+
+echo "regression sampled svr"
+python regression_svr.py "data/features_sampled.dat" "data/labels_0.dat" "predict/sampled_svr_0.dat"
+python regression_svr.py "data/features_sampled.dat" "data/labels_1.dat" "predict/sampled_svr_1.dat"
+python regression_svr.py "data/features_sampled.dat" "data/labels_2.dat" "predict/sampled_svr_2.dat"
+python regression_svr.py "data/features_sampled.dat" "data/labels_3.dat" "predict/sampled_svr_3.dat"
+
+echo "regression sampled logistic"
+python regression_logistic.py "data/features_sampled.dat" "data/labels_0.dat" "predict/sampled_logistic_0.dat"
+python regression_logistic.py "data/features_sampled.dat" "data/labels_1.dat" "predict/sampled_logistic_1.dat"
+python regression_logistic.py "data/features_sampled.dat" "data/labels_2.dat" "predict/sampled_logistic_2.dat"
+python regression_logistic.py "data/features_sampled.dat" "data/labels_3.dat" "predict/sampled_logistic_3.dat"
+
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2004 National ICT Australia --- All Rights Reserved
+# THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF SML.NICTA
+# The copyright notice above does not evidence any
+# actual or intended publication of this work.
+#
+# Authors:      Le Song
+# Last changed: 02/08/2006 (Christfried Webers)
+
+import numpy
+
+def setdiag0(K):
+    """Set the diagonal entries of a square matrix to 0
+    """
+    n = K.shape[0]
+    numpy.put(K, numpy.arange(n) * (n + 1), 0.0)
@@ -0,0 +1,24 @@
+import time
+import sys
+import numpy
+import vector
+from sklearn.cross_validation import train_test_split
+
+usage = "yolo"
+
+if __name__ == "__main__":
+
+	if (len(sys.argv)!=4):
+		print usage
+	else:
+		file_x = sys.argv[1];
+		file_y = sys.argv[2];
+		file_y_test = sys.argv[3];
+
+		X = numpy.genfromtxt(file_x, delimiter=' ')
+		y = numpy.genfromtxt(file_y, delimiter=' ')
+		
+		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
+		numpy.savetxt(file_y_test, y_test)
+		numpy.savetxt("data/features_train.dat",X_train)
+		numpy.savetxt("data/features_test.dat",X_test)