Вы находитесь на странице: 1 из 11

http://mrmartin.net/?p=223

view source

print?
1 %This file contains a walkthrough for explaining Gaussian Processes to
2 %any scientist, including statisticians and non statisticians.
3 %There is a .pdf file that completes this tutorial.
4
%% Demo #1 Univariate Normal %%
% A univariate normal distribution is defined by a real mean \mu and a
% non-negative variance \sigma^2.
mu = 6.2;          % the mean
sigma_squared = 2; % sigma^2 is the variance

% randn returns a random number from the standard normal distribution, which
% is the univariate normal with \mu = 0 and \sigma = 1.
standard_normal_random_number = randn;

% A standard normal draw is converted to a draw from N(mu, sigma^2) by scaling
% with the standard deviation (the square root of the variance) and shifting
% by the mean:
normal_random_number = sqrt(sigma_squared) * standard_normal_random_number + mu;

% Now we can plot what is going on. All draws are generated with a single
% vectorised call instead of growing the arrays one element at a time; the
% results are 1-by-n_samples row vectors, as the hist call below expects.
n_samples = 100000;
standard_normal_random_number = randn(1, n_samples);
normal_random_number = sqrt(sigma_squared) * standard_normal_random_number + mu;

% One histogram per column: column 1 is the standard normal, column 2 the
% shifted and scaled ("custom") normal.
hist([standard_normal_random_number' normal_random_number'],50)
legend('standard', 'custom')
26
%% Demo #2 Multivariate Normal %%
% The same trick works for the multivariate normal. \Mu has to be a vector,
% and \Sigma a symmetric positive semidefinite matrix.
Mu = [3; 0]; % the mean

% A matrix M is symmetric iff M(a,b) = M(b,a), and positive semidefinite iff
% x'*M*x >= 0 for any column vector x. These properties are satisfied by all
% matrices taken from the matlab function gallery('randcorr',n).
Sigma = [ 1.0000 -0.9195; -0.9195 1.0000];

% We need a matrix A with A*A' = Sigma, because then A*z has covariance
% A*cov(z)*A' = A*A' = Sigma for a standard normal vector z.
% eig returns V and D such that Sigma = V*D*V', so A = V*sqrt(D) works:
% A*A' = V*sqrt(D)*sqrt(D)*V' = V*D*V' = Sigma.
[V,D]=eig(Sigma);
A=V*(D.^(1/2));

% The standard multivariate normal is just a vector of independently drawn
% univariate standard normal random numbers.
standard_random_vector = randn(2,1);

% This is converted to an arbitrary multivariate normal by:
normal_random_vector = A * standard_random_vector + Mu;

% Now we can plot what is going on. Each column is one 2-D sample; all samples
% are drawn in one call rather than growing the arrays inside a loop.
% repmat is used (instead of implicit expansion) so the code also runs on
% MATLAB releases older than R2016b.
n_samples = 1000;
standard_random_vector = randn(2, n_samples);
normal_random_vector = A * standard_random_vector + repmat(Mu, 1, n_samples);

plot(standard_random_vector(1,:),standard_random_vector(2,:),'.')
hold on
plot(normal_random_vector(1,:),normal_random_vector(2,:),'r.')
legend('standard', 'custom')
58
%% A Kernel Matrix demonstration Brownian motion
n=40;
% The brownian motion kernel is defined through its very simple inverse
% (precision matrix): 2 on the diagonal and -1 on the first sub- and
% super-diagonal, zero everywhere else.
inverse=2*eye(n);
% triu(ones(n),1)-triu(ones(n),2) is 1 exactly on the first super-diagonal;
% its transpose is the first sub-diagonal.
inverse=inverse-(triu(ones(n),1)-triu(ones(n),2))';
inverse=inverse-(triu(ones(n),1)-triu(ones(n),2));

subplot(1,3,1);imagesc(inverse);title('Brownian motion K^{-1}')
subplot(1,3,2);imagesc(inv(inverse));title('Brownian motion K')

% Draw some samples with covariance K = inv(inverse).
% With the Cholesky factor R of the precision matrix (R'*R = K^{-1}),
% the vector R\r has covariance inv(R)*inv(R)' = inv(R'*R) = K.
% (Solving inverse\r directly would give K*r, whose covariance is K^2.)
r=randn(n,10);
R=chol(inverse);
subplot(1,3,3);plot(R\r);title('10 samples')
72
%% Demo #3 Gaussian Process %%
% One way of looking at a Gaussian Process is as a Multivariate Normal with
% an infinite number of dimensions. In order to model relationships between
% points, we construct the covariance matrix with a function that defines
% the value of the matrix for any pair of real numbers:
sigma_f = 1.1251; % parameter of the squared exponential kernel
l = 0.90441;      % parameter of the squared exponential kernel
kernel_function = @(x,x2) sigma_f^2*exp((x-x2)^2/(-2*l^2));

% This is one of many popular kernel functions, the squared exponential
% kernel. It favors smooth functions. (It is defined here as an anonymous
% function handle, and as written it accepts scalar inputs only.)

% We can also define an error function, which models the observation noise.
sigma_n = 0.1; % known noise on observed data
error_function = @(x,x2) sigma_n^2*(x==x2);
% This is just iid gaussian noise with mean 0 and variance sigma_n^2.

% Kernel functions can be added together. Here, we add the error kernel to
% the squared exponential kernel.
k = @(x,x2) kernel_function(x,x2)+error_function(x,x2);

% We can find the mean and the variance of the GP at each point.
% (The arrays are named gp_mean / gp_variance so that they do not shadow the
% built-in function mean.)
prediction_x=-2:0.01:1;
gp_mean = zeros(1,length(prediction_x));     % the prior mean is 0 everywhere
gp_variance = zeros(1,length(prediction_x));
for i=1:length(prediction_x)
    gp_variance(i) = k(prediction_x(i),prediction_x(i));
end

% Helper that fills the region between the curves lower and upper with color.
plot_variance = @(x,lower,upper,color) set(fill([x,x(end:-1:1)],[upper,fliplr(lower)],color),'EdgeColor',color);

% A 95% band of a normal distribution is mean +/- 1.96 standard deviations,
% so the variance has to be square-rooted before it is scaled.
gp_std = sqrt(gp_variance);

% Open a new figure so the band is not drawn into axes left over from an
% earlier demo.
figure
plot_variance(prediction_x,gp_mean-1.96*gp_std,gp_mean+1.96*gp_std,[0.9 0.9 0.9])
hold on
set(plot(prediction_x,gp_mean,'k'),'LineWidth',2)
105
%% Demo #4 Gaussian Process Sampling %%
% To sample from the Gaussian Process defined by the kernel k, first build
% the kernel matrix for the points we want to plot (prediction_x).
n_points = length(prediction_x);
K = zeros(n_points,n_points);
for row = 1:n_points
    % Only the upper triangle is evaluated; every off-diagonal value is
    % mirrored into the lower triangle because the kernel matrix is
    % symmetric. (This is an unnecessary speedup trick.)
    K(row,row) = k(prediction_x(row),prediction_x(row));
    for col = row+1:n_points
        K(row,col) = k(prediction_x(row),prediction_x(col));
        K(col,row) = K(row,col);
    end
end

% Factor K = V*D*V' and form A = V*sqrt(D), so that A*A' = K.
[V,D] = eig(K);
A = V*(D.^(1/2));

% Then use the kernel matrix as the covariance of a multivariate normal:
% every column of gaussian_process_sample is one draw A*z with z standard
% normal.
n_draws = 7;
draws = randn(n_points,n_draws);
clear gaussian_process_sample;
gaussian_process_sample = zeros(n_points,n_draws);
for c = 1:n_draws
    standard_random_vector = draws(:,c);
    gaussian_process_sample(:,c) = A * standard_random_vector;
end

% real() discards any imaginary part that the element-wise square root of D
% would introduce if an eigenvalue came out negative.
plot(prediction_x,real(gaussian_process_sample))
130
%% Demo #5 Gaussian Process Regression %%
% Observed inputs X_o and the noisy observations Y_o made at them.
X_o = [-1.5 -1 -0.75 -0.4 -0.3 0]';
Y_o = [-1.6 -1.3 -0.5 0 0.3 0.6]';

n_obs = length(X_o);
n_pred = length(prediction_x);

% Covariance between the observations. The observations are noisy, so the
% full kernel k (squared exponential + error kernel) is used here.
K = zeros(n_obs);
for row = 1:n_obs
    for col = 1:n_obs
        K(row,col) = k(X_o(row),X_o(col));
    end
end

% Note that we use kernel_function, not kernel_function+error_function, when
% incorporating points other than noisy measurements.
% Covariance between the prediction points: only the upper triangle is
% evaluated and then mirrored into the lower one, because the matrix is
% symmetric. (This is an unnecessary speedup trick.)
K_ss = zeros(n_pred,n_pred);
for row = 1:n_pred
    for col = row:n_pred
        K_ss(row,col) = kernel_function(prediction_x(row),prediction_x(col));
    end
end
K_ss = K_ss + triu(K_ss,1)';

% Cross-covariance between prediction points (rows) and observations (columns).
K_s = zeros(n_pred,n_obs);
for row = 1:n_pred
    for col = 1:n_obs
        K_s(row,col) = kernel_function(prediction_x(row),X_o(col));
    end
end

% Posterior mean, and 1.96 posterior standard deviations (the half-width of
% the 95% band) at every prediction point.
Mu = (K_s/K)*Y_o;
Sigma = 1.96*sqrt(diag(K_ss-K_s/K*K_s'));

figure
% Helper that fills the region between the curves lower and upper with color.
plot_variance = @(x,lower,upper,color) set(fill([x,x(end:-1:1)],[upper,fliplr(lower)],color),'EdgeColor',color);
plot_variance(prediction_x,(Mu-Sigma)',(Mu+Sigma)',[0.8 0.8 0.8])
hold on
set(plot(prediction_x,Mu,'k-'),'LineWidth',2)
set(plot(X_o,Y_o,'r.'),'MarkerSize',15)

% This gives a poor model, because we aren't using good parameters to model
% the function. In order to get better parameters, we can maximize the
% evidence, computed here as
% exp(-(Y_o'*inv(K)*Y_o + log(det(K)) + n*log(2*pi))/2).
evidence_exponent = (Y_o'/K*Y_o+log(det(K))+length(Y_o)*log(2*pi))/-2;
evidence = exp(evidence_exponent);
title (['this plot has evidence ' num2str(evidence)])

legend('confidence bounds','mean','data points','location','SouthEast')
175
%% Demo #5.2 Sample from the Gaussian Process posterior
% kernel_function is kept as well: the prediction points are not noisy
% measurements, so (as in Demo #5) their covariances must be built from the
% noise-free kernel rather than from k.
clearvars -except k kernel_function prediction_x K X_o Y_o

% We can also sample from this posterior, the same way as we sampled before.
n_pred = length(prediction_x);
n_obs = length(X_o);

% Covariance between the prediction points. Only the upper triangle is
% evaluated and then copied to the bottom, because the matrix is symmetric.
% (This is an unnecessary speedup trick.)
K_ss=zeros(n_pred,n_pred);
for i=1:n_pred
    for j=i:n_pred
        K_ss(i,j)=kernel_function(prediction_x(i),prediction_x(j));
    end
end
K_ss=K_ss+triu(K_ss,1)';

% Cross-covariance between prediction points (rows) and observations (columns).
K_s=zeros(n_pred,n_obs);
for i=1:n_pred
    for j=1:n_obs
        K_s(i,j)=kernel_function(prediction_x(i),X_o(j));
    end
end

% Posterior mean and covariance given the observations. K is the noisy
% observation covariance kept from Demo #5.
posterior_mean = K_s/K*Y_o;
posterior_cov = K_ss-K_s/K*K_s';
% Remove the round-off asymmetry so that eig returns real eigenvectors.
posterior_cov = (posterior_cov+posterior_cov')/2;

% A = V*sqrt(D) satisfies A*A' = posterior_cov. Eigenvalues that come out
% slightly negative through round-off are clamped to zero, which keeps A real.
[V,D]=eig(posterior_cov);
A=V*diag(sqrt(max(diag(D),0)));

% Each column is one posterior sample: mean + A*z with z standard normal.
n_draws = 7;
gaussian_process_sample = A*randn(n_pred,n_draws) + repmat(posterior_mean,1,n_draws);

hold on
plot(prediction_x,real(gaussian_process_sample))

set(plot(X_o,Y_o,'r.'),'MarkerSize',20)
206
%% Demo 6 finding parameters of a Gaussian Process with grid search %%
% evidence_2_param_GP is defined outside this script. It is called as
% evidence_2_param_GP([sigma l], sigma_n); the code below minimises its value
% and reports exp(-value) as the evidence, so it is presumably the negative
% log evidence -- confirm against its definition.
sigma_n = 0.2; % we know the amount of noise from the data
sigma_range = 0.01:0.04:4;
l_range = 0.01:0.04:2;

% Rows of evidence index sigma_range, columns index l_range.
evidence=zeros(length(sigma_range),length(l_range));
for i=1:length(sigma_range)
    for j=1:length(l_range)
        evidence(i,j)=evidence_2_param_GP([sigma_range(i) l_range(j)],sigma_n);
    end
end

% Open a new figure so the image is not drawn over an earlier plot that still
% has "hold on" active.
figure
% imagesc puts columns on the x axis, hence the transpose (sigma on x, l on y).
imagesc(sigma_range,l_range,real(evidence)');colorbar
title('inverse log evidence of the squared exponential kernel')
xlabel('sigma parameter')
ylabel('l parameter')
hold on

% Locate the grid point with the smallest value. ind2sub converts the linear
% index into (row, column) = (sigma index, l index) correctly for every
% position, including the last row of each column.
[v, location]=min(evidence(:));
[sigma_location, l_location]=ind2sub(size(evidence),location);
l = l_range(l_location);
sigma = sigma_range(sigma_location);

set(plot(sigma,l,'rx'),'MarkerSize',25)

% finding parameters of a Gaussian Process with fminsearch %%
starting_place = randn(2,10); % one random starting point per column
sigma_n = 0.2;
% Bind sigma_n with an anonymous function instead of passing it through
% fminsearch's legacy extra-argument syntax.
objective = @(p) evidence_2_param_GP(p,sigma_n);

n_starts = size(starting_place,2);
x_guess = zeros(2,n_starts);
objective_value = zeros(1,n_starts); % named so it does not shadow the built-in error()
for i=1:n_starts
    x_guess(:,i)=fminsearch(objective,starting_place(:,i));
    objective_value(i)=objective(x_guess(:,i));
end
[v, location]=min(objective_value);

disp(['The optimal evidence is ' num2str(exp(-objective(x_guess(:,location)))) ','])
% The starting points are random and may lead to negative parameters.
% The magnitudes are the values plotted below, so report those too.
% NOTE(review): this assumes the objective is insensitive to the sign of its
% parameters -- confirm in evidence_2_param_GP.
x_guess=abs(x_guess);
disp(['(for parameters \sigma=' num2str(x_guess(1,location)) ' and l=' num2str(x_guess(2,location)) ')'])
set(plot(x_guess(1,location),x_guess(2,location),'r.'),'MarkerSize',25)

legend('grid search minimum', 'matlab''s fminsearch minimum')
247
%% Demo 7 Gaussian Process with input in 2-D
clear
close all
sigma_f=0.5;
l=1;
% Squared exponential kernel for column-vector inputs: (x-x2)'*(x-x2) is the
% squared Euclidean distance between the two points.
kernel_function = @(x,x2) sigma_f^2*exp(((x-x2)'*(x-x2))/(-2*l^2));

grid_size=15;
[x1, x2]=meshgrid(1:grid_size,1:grid_size);
% prediction_x is a 2-by-n matrix where each column is a test location in x.
prediction_x=[x1(:)';x2(:)']./5;
% Number of test locations, taken from the column count explicitly: length()
% would return the larger of the two dimensions, whichever that happens to be.
n_points=size(prediction_x,2);

% Kernel matrix between all pairs of test locations, preallocated so that it
% is not grown inside the double loop.
K=zeros(n_points);
for i=1:n_points
    for j=1:n_points
        K(i,j)=kernel_function(prediction_x(:,i),prediction_x(:,j));
    end
end

% A = V*sqrt(D) satisfies A*A' = K.
[V,D]=eig(K);
A=V*(D.^(1/2));

% Draw and show 10 samples, one surface at a time.
for sample_index=1:10
    standard_random_vector = randn(n_points,1);
    gaussian_process_sample = A * standard_random_vector;
    % real() discards the imaginary part that appears when round-off makes
    % an eigenvalue of K slightly negative.
    surf(x1,x2,reshape(real(gaussian_process_sample),grid_size,grid_size));
    xlabel('x_1')
    ylabel('x_2')
    zlabel('y')
    pause % wait for a key press before drawing the next sample
end
280
%% Demo 8 (offline) Flying through a Gaussian Process sample in 2-D
clear
close all
sigma_f=0.5;
l=1;
% Element-wise kernel definition. The kernel matrix below is built with a
% vectorised formula instead and does not call this handle.
kernel_function = @(x,x2) sigma_f^2*exp(dot(x-x2,x-x2)/(-2*l^2));

grid_size=20;
animation_length=100;
tic
% The grid has grid_size columns and grid_size+animation_length rows; the
% animation slides a grid_size-row window down the rows.
[x1, x2]=meshgrid(1:grid_size,1:grid_size+animation_length);
toc
% prediction_x is an n-by-2 matrix where each ROW is a test location in x.
prediction_x=[x1(:)';x2(:)']'./5;

% Vectorised kernel matrix:
% K(i,j) = exp(x_i.x_j/s^2 - |x_i|^2/(2 s^2) - |x_j|^2/(2 s^2))
%        = exp(-|x_i - x_j|^2/(2 s^2)),  with s = sigma_f.
tic
n=size(prediction_x,1);
scaled_gram=(prediction_x*prediction_x')/sigma_f^2;
d=diag(scaled_gram);
K=exp(scaled_gram-(ones(n,1)*d')/2-(d*ones(1,n))/2);
toc

% A = V*sqrt(D) satisfies A*A' = K.
[V,D]=eig(K);
A=V*(D.^(1/2));

% One sample over the whole strip; real() drops round-off imaginary parts and
% the result is divided by 3.
standard_random_vector = randn(length(prediction_x),1);
gaussian_process_sample = real(A * standard_random_vector)./3;

% Back to grid layout: same shape as x1 and x2.
sample_rect=reshape(real(gaussian_process_sample),grid_size+animation_length,grid_size);

% Show the first window once before the animation starts.
i=1;
window=i:i+grid_size-1;
surf(x1(window,:),x2(window,:),sample_rect(window,:));

% Slide the window one row per frame.
for i=1:animation_length
    window=i:i+grid_size-1;
    surf(x1(window,:),x2(window,:),sample_rect(window,:));
    axis([0 20 i i+grid_size -1 1])
    xlabel('x_1')
    ylabel('x_2')
    zlabel('y')
    pause(0.1)
end
330
%% Demo 9 (online) Flying through a Gaussian Process sample in 2-D
% rbf is defined outside this script. It is called as rbf(X, sigma_f) and as
% rbf(X, sigma_f, X2), and its results are used as the covariance of the
% rows of X and as the cross-covariance between the rows of X and of X2.
clear
close all
sigma_f=7;

grid_size=50;
[x1, x2]=meshgrid(1:grid_size);
x1_place=grid_size; % largest x_1 coordinate generated so far
% prediction_x is an n-by-2 matrix where each ROW is a test location in x.
prediction_x=[x1(:)';x2(:)']';

% Sample the initial grid_size-by-grid_size frame from the prior.
K=rbf(prediction_x,sigma_f);
[V,D]=eig(K);
A=V*(D.^(1/2));

standard_random_vector = randn(length(prediction_x),1);
gaussian_process_sample = real(A * standard_random_vector);

sample_rect=reshape(real(gaussian_process_sample),grid_size,grid_size);
surf(x1,x2,sample_rect);
i=1;
axis([1 grid_size i i+grid_size-1 -2 2])
xlabel('x_1')
ylabel('x_2')
zlabel('y')

for i=1:2000
    % Generate the next x coordinates: one new column at x_1 = x1_place.
    x1_place = x1_place + 1;
    [x1_next, x2_next]=meshgrid(x1_place,1:grid_size);
    % The last three columns of the current frame are the points the new
    % column is conditioned on.
    x1_old=x1(:,end-2:end);
    x2_old=x2(:,end-2:end);
    % Slide the window: drop the oldest column, append the new one.
    x1=[x1(:,2:end) x1_next];
    x2=[x2(:,2:end) x2_next];
    previous_x=[x1_old(:) x2_old(:)];
    new_x=[x1_next(:) x2_next(:)];

    % Generate y for these coordinates, given already sampled y. To reduce
    % complexity, we only look at the y of the previous frame's last columns,
    % not all observed y. The 10^-5 on the diagonal is jitter.
    tic
    K=rbf(previous_x,sigma_f)+eye(length(previous_x))*10^-5;
    K_ss=rbf(new_x,sigma_f)+eye(length(new_x))*10^-5;
    K_s=rbf(new_x,sigma_f,previous_x);

    % A*A' is the conditional covariance of the new column.
    [V,D]=eig(K_ss-K_s/K*K_s');
    A=real(V*(D.^(1/2)));

    standard_random_vector = randn(length(K_ss),1);
    % Conditional sample: conditional mean plus correlated noise. The last
    % length(previous_x) entries of gaussian_process_sample are the values of
    % the three conditioning columns (the sample is stored column by column).
    new_sample = A * standard_random_vector+K_s/K*real(gaussian_process_sample(end-length(previous_x)+1:end));
    % Drop the oldest column of values and append the new one.
    gaussian_process_sample = [gaussian_process_sample(length(new_sample)+1:end);new_sample];
    toc
    sample_rect=reshape(real(gaussian_process_sample),grid_size,grid_size);

    surf(x1,x2,sample_rect);
    % Follow the window that is actually plotted: after the shift above, x1
    % spans i+1 .. i+grid_size. Using i .. i+grid_size-1 would clip the
    % newest column.
    axis([x1(1,1) x1(1,end) 1 grid_size -2 2])
    xlabel('x_1')
    ylabel('x_2')
    zlabel('y')

    pause(0.05)
end
400
%% Demo 10 Sampling the Gaussian Process prior in 3D
% The kernel parameters are taken from the workspace when they exist. The
% earlier demos that end with "clear" remove them, so fall back to the values
% of Demo #3 to keep this cell runnable on its own.
if ~exist('sigma_f','var'), sigma_f = 1.1251; end  % kernel amplitude
if ~exist('l','var'),       l = 0.90441;      end  % kernel length scale
if ~exist('sigma_n','var'), sigma_n = 0.1;    end  % observation noise

% The previously used kernel_function and error_function can be extended to
% support vector input. This is a natural case of the squared exponential
% kernel, with the same parameters and properties. (Note that this is the
% rotationally invariant version.)
kernel_function_m = @(x,x2) sigma_f^2*exp((x-x2)'*(x-x2)/(-2*l^2));
% Here, the error function needs to test whether two vectors are exactly the
% same. This is done by counting the number of matches between x and x2.
error_function_m = @(x,x2) sigma_n^2*(sum(x==x2)==length(x));
% k_m is used instead of k.
k_m = @(x,x2) kernel_function_m(x,x2)+error_function_m(x,x2);

% The resolution option changes the granularity of the sampled Gaussian
% Process. Since resolution^2 points need to be sampled, this cannot be too
% high. In even higher dimensions, taking samples from a Gaussian Process at
% every point of a grid becomes prohibitive, so they cannot be visualised by
% their samples.
resolution=5;
% Generate the grid where to take samples, and save it the same way as
% prediction_x in previous Demos.
[a, b]=meshgrid(linspace(0,1,resolution));
% Before, prediction_x held one scalar per point; now it is 2-by-n, with one
% 2-D point per column.
prediction_x=[a(:) b(:)]';

% This is done exactly as in Demo 4.
n_points=size(prediction_x,2);
K=zeros(n_points,n_points);
for i=1:n_points
    for j=i:n_points % We only calculate the top half of the matrix.
        % (This is an unnecessary speedup trick.)
        % Here, we use the vectors prediction_x(:,i) for each point.
        K(i,j)=k_m(prediction_x(:,i),prediction_x(:,j));
    end
end
K=K+triu(K,1)'; % copy the upper half to the bottom, because K is symmetric

% A = V*sqrt(D) satisfies A*A' = K.
[V,D]=eig(K);
A=V*(D.^(1/2));

% Take a single sample, and plot the surface. Note that this becomes
% impossible in higher dimensions.
standard_random_vector = randn(n_points,1);
sample=A * standard_random_vector;
% real() guards against a round-off imaginary part, as in the other demos.
surf(reshape(real(sample),resolution,resolution))

% An important note: a Gaussian Process with 1D input is a continuous line,
% but for a 2D input, it is a continuous surface. That's because the Gaussian
% Process defines a value for every possible point everywhere in the x-y space.
450
%% Demo 11 Sampling the Gaussian Process posterior in many dimensions
% Uses the vector-input kernel k_m defined in Demo 10.
dimensions=4;
% Reset the random number generator so that the same numbers are created
% each time this is executed, for repeatability.
rng('default');
% Generate 7 random observations in 4 dimensions (one observation per column).
X_o = rand(dimensions,7);
Y_o = [-2 -1.6 -1.3 -0.5 0 0.3 0.6]';

% Covariance between all pairs of observations.
n_obs = size(X_o,2);
K = zeros(n_obs);
for row = 1:n_obs
    for col = 1:n_obs
        K(row,col) = k_m(X_o(:,row),X_o(:,col));
    end
end

% Three prediction points, one per column.
prediction_x=[0 0 0 0; 1 0 0 0; 1 1 1 1]';
n_pred = size(prediction_x,2);

% Covariance between the prediction points. Only the upper triangle is
% evaluated and then copied to the bottom, because the matrix is symmetric.
% (This is an unnecessary speedup trick.)
K_ss = zeros(n_pred);
for row = 1:n_pred
    for col = row:n_pred
        K_ss(row,col) = k_m(prediction_x(:,row),prediction_x(:,col));
    end
end
K_ss = K_ss + triu(K_ss,1)';

% Cross-covariance between prediction points (rows) and observations
% (columns). This matrix is not symmetric, so every entry is computed.
K_s = zeros(n_pred,n_obs);
for row = 1:n_pred
    for col = 1:n_obs
        K_s(row,col) = k_m(prediction_x(:,row),X_o(:,col));
    end
end

% Calculate Mu and Sigma according to the equation on page 13 of the
% accompanying .pdf: the posterior mean and the posterior covariance.
Mu = (K_s/K)*Y_o;
Sigma = K_ss-K_s/K*K_s';

11 of 11 22/12/2015 23:45

Вам также может понравиться