### Gamma Ray Data kernreg = function(y,x,h,newx){ ### kernel regression n = nrow(x) d = ncol(x) m = nrow(newx) f = rep(0,m) for(i in 1:m){ tmp = sqrt(apply((matrix(newx[i,],n,d,byrow=TRUE) - x)^2,1,sum)) w = exp(-tmp^2/(2*h^2)) f[i] = sum(w*y)/sum(w) } return(f) } Cv = function(y,x,H){ ### cross validation huge = mean(y^2)*10 n = nrow(x) d = ncol(x) m = length(H) cv = rep(0,m) for(j in 1:m){ h = H[j] for(i in 1:n){ tmp = sqrt(apply((matrix(x[i,],n,d,byrow=TRUE) - x)^2,1,sum)) w = exp(-tmp^2/(2*h^2)) w = w/sum(w) f = sum(w*y) if(w[i] == 1)cv[j] = cv[j] + huge if(w[i] < 1)cv[j] = cv[j] + ((y[i] - f)/(1-w[i]))^2 } } cv = cv/n return(cv) } Kernreg = function(y,x,H,newx){ ### kernel regression metafunction ### applies cross validation then fits regression using ### best bandwidth cv = Cv(y,x,H) h = H[which.min(cv)] f = kernreg(y,x,h,newx) return(list(f=f,h=h,cv=cv)) } hat = function(p){ ### convert p = P(Y=1|X=x) to hard classifications n = length(p) y = rep(1,n) y[p < .5] = 0 return(y) } ###### Get the data d = read.table("magic04.data",sep=",",header=FALSE) n = nrow(d) a = d[,11] y = rep(1,n) y[a=="h"]=0 x = d[,1:10] x = as.matrix(x) x = scale(x) ### extract training data I1 = (1:n)[y == 1] I0 = (1:n)[y == 0] k = 200 J1 = sample(I1,size=k,replace=FALSE) J0 = sample(I0,size=k,replace=FALSE) xtrain = x[c(J1,J0),] ytrain = y[c(J1,J0)] training.data = data.frame(x=xtrain,y=ytrain) ntrain = nrow(xtrain) names(training.data) = c(paste("x",1:10,sep=""),"y") ### extract test data I = setdiff(1:n,c(J1,J0)) I1 = I[y[I] == 1] I0 = I[y[I] == 0] k = 500 J1 = sample(I1,size=k,replace=FALSE) J0 = sample(I0,size=k,replace=FALSE) xtest = x[c(J1,J0),] ytest = y[c(J1,J0)] test.data = data.frame(x=xtest,y=ytest) ntest = nrow(xtest) names(test.data) = c(paste("x",1:10,sep=""),"y") ### logistic regression out = glm(y ~ .,family="binomial",data=training.data) p = predict(out,type="response") yhat = hat(p) error = mean(ytrain != yhat) cat("Logistic Regression: Training Error: ",error,"\n") p = predict(out,type="response",newdata=test.data) yhat = hat(p) error = mean(ytest != yhat) cat("Logistic Regression: Test Error: ",error,"\n") ### svm library(kernlab) out = ksvm(xtrain,as.factor(ytrain),kernel="rbfdot",kpar=list(sigma=0.05)) tmp = predict(out,xtest) mean(tmp != ytest) postscript("gammaray.kernel.ps") ### kernel regression H = seq(.4,2,length=50) out = Kernreg(ytrain,xtrain,H,xtest) tmp = kernreg(ytrain,xtrain,out$h,xtrain) yhat = hat(tmp$f) error = mean(ytrain != yhat) cat("Kernel Regression: Training Error: ",error,"\n") yhat = hat(out$f) error = mean(ytest != yhat) cat("Kernel Regression: Test Error: ",error,"\n") plot(H,out$cv,type="l",lwd=3,xlab="Bandwidth", ylab="Cross-validation") abline(v=out$h,col="red") dev.off() postscript("gammaray.additive.ps") #### additive model library(gam) out = gam(y ~ lo(x1) + lo(x2) + lo(x3) + lo(x4) + lo(x5) + lo(x6) + lo(x7) + lo(x8) + lo(x9) + lo(x10),data=training.data) print(summary(out)) yhat = hat(predict(out)) error = mean(ytrain != yhat) cat("Additive Regression: Training Error: ",error,"\n") yhat = hat(predict(out,newdata=test.data)) error = mean(ytest != yhat) cat("Additive Regression: Test Error: ",error,"\n") par(mfrow=c(2,5)) plot(out,lwd=3,xlab="",ylab="") dev.off() out = gam(y ~ lo(x2) + lo(x3) + lo(x9),data=training.data) print(summary(out)) yhat = hat(predict(out)) error = mean(ytrain != yhat) cat("Additive Regression: Training Error: ",error,"\n") yhat = hat(predict(out,newdata=test.data)) error = mean(ytest != yhat) cat("Additive Regression: Test Error: ",error,"\n") ### knn postscript("gammaray.knn.ps") library(class) par(mfrow=c(1,1)) kmax = 50 error = rep(0,kmax) for(i in 1:kmax){ out=knn(xtrain,xtest,ytrain,k=i) error[i] = mean(out != ytest) } plot(1:kmax,error,type="l",lwd=3,xlab="k", ylab="Test Error") dev.off() ### trees postscript("gammaray.trees.ps") library(tree) out = tree(as.factor(ytrain) ~ xtrain,data=training.data) print(summary(out)) plot(out,type="u",lwd=3) text(out) dev.off() postscript("gammaray.tree2.ps") cv = cv.tree(out,method="misclass") plot(cv,lwd=3) dev.off() postscript("gammaray.tree3.ps") plot(cv,lwd=3) newtree = prune.tree(out,best=5,method="misclass") print(summary(newtree)) plot(newtree,lwd=3) text(newtree,cex=2) dev.off()