%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Matlab script to perform cross validation on a model generated for
% numeric data from agriculture
%
% purpose: data mining with SVMs (support vector machines)
%
% requires:
%   - statistics toolbox for cvpartition (from Matlab2008a)
%   - readColData script (see link below)
%   - SVMTorch (compiled version, of course); the SVMTorch and SVMTest
%     executables must be on the shell's PATH
%   - data structure used for the results (cv_results)
%
% outputs (left in the workspace):
%   - j_results: struct with column vectors j, mae, rmse (one row per
%     tested parameter value)
%   - cv_results: actual/predicted/error values of the LAST parameter value
%   - duration: wall-clock run time in seconds
%
% files written to the current directory: svm-train, svm-test,
% svm-results (svm-model is created and removed for every fold)
%
% new:
%   2008-09-30: fully commented
% -----
% Georg Ru{\ss}
% russ@iws.cs.uni-magdeburg.de
% 2008-09-30
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Preparation steps, workspace
% clean workspace
clear all;

% set clock
clock_start = clock;

% seed random for reproducible results
%rand('seed',1);

% change paths to wherever your data is located and readable to matlab
% uses script readColData from
% http://web.cecs.pdx.edu/~gerry/MATLAB/plotting/loadingPlotData.html#colHeadings
[label,id_column,data] = readColData('sorted_all',10,10,1);

%% data-specific stuff
% generate three data sets for the model to play with
% one:   N1, Yield2003, EM38                        -- target: Yield2004
% two:   N1, Yield2003, EM38, N2, REIP32            -- target: Yield2004
% three: N1, Yield2003, EM38, N2, REIP32, N3, REIP49 -- target: Yield2004
% works by eliminating the respective columns from the 'data' matrix above
set_1 = data;
set_1(:,[2 3 4 5 7]) = [];

set_2 = data;
set_2(:,[3 5 7]) = [];

% only set_3 is actually used here
set_3 = data;
set_3(:,7) = [];
Size_set_3 = size(set_3);

% column of set_3 holding the target value (Yield2004)
% NOTE(review): presumably the last column of set_3 -- confirm against the
% layout of 'sorted_all'
target_col = 8;

%% modeling stage
% partitioning parameters for cross validation
k = 10; % number of folds

% generate a k-fold partition (cvpartition from matlab's statistics
% toolbox). A single 'Holdout' partition would hand back the very same
% training/test split in every iteration of the fold loop below, so the
% partition has to be a k-fold one that is indexed by the fold number.
% (left unsuppressed on purpose: shows the partition summary)
cvdata = cvpartition(Size_set_3(1,1),'kfold',k)

j_results = struct('j',[],'mae',[],'rmse',[]);

% loop for varying certain SVM parameters systematically
for j = 1:20
    % structured object to store actual, predicted and error values
    cv_results = struct('actual',[], 'prediction', [], 'abserr', [], 'squerr', []);
    j_results.j = vertcat(j_results.j,j);

    for i = 1:k
        % fold i: every row of set_3 is used for testing exactly once
        TrainSet = set_3(training(cvdata,i),:);
        TestSet  = set_3(test(cvdata,i),:);

        % write training data to file (first line: rows and columns)
        dlmwrite('svm-train',size(TrainSet),' ');
        dlmwrite('svm-train',TrainSet,'-append','delimiter',' ');

        % write test data to file (first line: rows and columns)
        dlmwrite('svm-test',size(TestSet),' ');
        dlmwrite('svm-test',TestSet,'-append','delimiter',' ');

        % remove leftovers of a previous run so that a failing SVMTorch or
        % SVMTest call cannot be masked by stale files
        if exist('svm-model','file')
            delete('svm-model');
        end
        if exist('svm-results','file')
            delete('svm-results');
        end

        % generate model; j is passed as the value of the -std option
        train_cmd = sprintf('SVMTorch -rm -t 2 -eps 0.3 -std %u svm-train svm-model', j);
        status = system(train_cmd);
        if status ~= 0
            warning('svm_cv:train','SVMTorch returned status %d (j=%d, fold=%d)',status,j,i);
        end

        % test model and store results in file 'svm-results'
        status = system('SVMTest -oa svm-results svm-model svm-test');
        if status ~= 0
            warning('svm_cv:test','SVMTest returned status %d (j=%d, fold=%d)',status,j,i);
        end

        % remove model
        if exist('svm-model','file')
            delete('svm-model');
        end

        % read in model output on test values (from file)
        res_read = dlmread('svm-results');

        % append values
        cv_results.prediction = vertcat(cv_results.prediction,res_read);
        cv_results.actual = vertcat(cv_results.actual,TestSet(:,target_col));
    end

    % generate error measures and store them inside the cv_results struct
    cv_results.abserr = abs(cv_results.actual - cv_results.prediction);
    cv_results.squerr = (cv_results.abserr).^2;

    % calculate mae and rmse
    % (left unsuppressed on purpose: serves as progress output)
    mae = mean(cv_results.abserr)
    rmse = sqrt(mean(cv_results.squerr))

    % store mae and rmse for this particular svm parameter setting
    j_results.mae = vertcat(j_results.mae,mae);
    j_results.rmse = vertcat(j_results.rmse,rmse);
end

%% plot mae and rmse against j (parameter variation variable)
clf;
plot(j_results.j,j_results.mae,'--b');
hold on;
plot(j_results.j,j_results.rmse,'-r');
legend('mae','rmse');

%% get script stats
duration = etime(clock, clock_start)