R : Copyright 2002, The R Development Core Team Version 1.5.1 (2002-06-17) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type `license()' or `licence()' for distribution details. R is a collaborative project with many contributors. Type `contributors()' for more information. Type `demo()' for some demos, `help()' for on-line help, or `help.start()' for a HTML browser interface to help. Type `q()' to quit R. > invisible(options(echo = TRUE)) > library(rpart) > > target.lrn <- read.table("../lrn/num/472.dat",header=T,colClasses="numeric") > target.val <- read.table("../val/num/472.dat",header=T,colClasses="numeric") > target.tst <- read.table("../tst/num/472.dat",header=T,colClasses="numeric") > > > y.lrn <- target.lrn[,1] > y.val <- target.val[,1] > y.tst <- target.tst[,1] > y <- c(y.lrn,y.val,y.tst) > > n.lrn <- length(y.lrn) > n.val <- length(y.val) > n.tst <- length(y.tst) > n <- length(y) > > rm(target.lrn,target.val,target.tst) > > wts <- mat.or.vec(n,1) ; for (i in 1:n.lrn) wts[i]=1 > idx.lrn <- 1:n.lrn > idx.val <- (n.lrn+1):(n.lrn+n.val) > idx.tst <- (n.lrn+n.val+1):n > > mod <- read.table("../cty_mod.txt", + header=F,colClasses="character",col.names=c("file","feature","type")) > > n.mod <- length(mod$file) > > first.time <- TRUE > > for (i in 1:n.mod) { + + fn.lrn <- paste("../lrn/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.val <- paste("../val/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.tst <- paste("../tst/",mod$type[i],"/",mod$file[i],".dat",sep="") + print(mod$feature[i]) + + if (mod$type[i]=="chr") { + + f.lrn <- read.table(fn.lrn, + header=T,colClasses="character",blank.lines.skip=F) + f.val <- read.table(fn.val, + header=T,colClasses="character",blank.lines.skip=F) + f.tst <- read.table(fn.tst, + header=T,colClasses="character",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + if (mod$feature[i]=="STATE") { + f[f=="AS"|f=="DC"|f=="DE"|f=="MA"|f=="ME"|f=="NH"] <- "S1" + f[f=="OH"|f=="RI"|f=="VI"|f=="WV"] <- "S1" + f[f=="AA"|f=="AE"|f=="AP"|f=="CT"|f=="GU"|f=="MD"] <- "S2" + f[f=="NJ"|f=="NY"|f=="PA"|f=="PA"|f=="VA"|f=="VT"] <- "S2" + f[f=="WY"] <- "S2" + f[f=="AK"|f=="UT"|f=="MS"] <- "S3" + f[f=="NE"|f=="ND"] <- "S4" + f[f=="SD"|f=="SC"] <- "S5" + } + + f <- as.factor(f) + + n.lev <- nlevels(f) + f.name <- levels(f) + if(n.lev==2) f.name <- c(mod$feature[i],mod$feature[i]) + print(paste(" nlevels = ",n.lev)) + + f <- model.matrix(y ~ f - 1) # Note: Intercept removed. + f <- f[,2:n.lev] # Note: First dummy deleted. + f.name <- f.name[2:n.lev] # Note: First name deleted. + + } else { + + f.lrn<-read.table(fn.lrn, + header=T,colClasses="numeric",blank.lines.skip=F) + f.val<-read.table(fn.val, + header=T,colClasses="numeric",blank.lines.skip=F) + f.tst<-read.table(fn.tst, + header=T,colClasses="numeric",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + f[is.na(f)] <- 0 + + f.name <- mod$feature[i] + + } + + if (first.time) { + X <- f + X.names <- f.name + first.time <- FALSE + } else { + X <- cbind(prev.X,f) + X.names <- c(prev.X.names,f.name) + } + + prev.X <- X + prev.X.names <- X.names + + } [1] "LASTGIFT" [1] "PEPSTRFL" [1] " nlevels = 2" [1] "STATE" [1] " nlevels = 33" [1] "RECP3" [1] " nlevels = 2" [1] "DOB" [1] "MAILCODE" [1] " nlevels = 2" [1] "MHUC2" [1] "LASTDATE" [1] "MINRAMNT" > > rm(prev.X,prev.X.names) > rm(f.lrn,f.val,f) > > dimnames(X) <- list(NULL,X.names) > > ctrl <- rpart.control(minsplit=90,maxsurrogate=0,maxdepth=15,cp=0.0008) > tree <- rpart(y~X,weights=wts,method="anova",control=ctrl) > > source("psopts.r") > postscript(file="cty_tree_0008.eps") > par.default <- par(no.readonly=TRUE) > par(mar=c(0,0,0,0)) > plot(tree) > text(tree) > par(par.default) > dev.off() null device 1 > > print(ctrl) $minsplit [1] 90 $minbucket [1] 30 $cp [1] 8e-04 $maxcompete [1] 4 $maxsurrogate [1] 0 $usesurrogate [1] 2 $surrogatestyle [1] 0 $maxdepth [1] 15 $xval [1] 10 > print(tree$frame) var n wt dev yval complexity ncompete 1 X.LASTGIFT 95412 66900 1344637.77 0.7887979 0.0034462960 4 2 X.LASTGIFT 94735 66431 1163123.67 0.7666839 0.0010147590 4 4 87823 61590 861271.89 0.7265038 0.0006310128 0 5 6912 4841 300487.30 1.2778785 0.0006849709 0 3 X.PEPSTRFL 677 469 176880.08 3.9211087 0.0031555503 4 6 X.MHUC2 428 296 37621.62 1.6216216 0.0009874031 4 12 385 275 19079.64 1.0363636 0.0004344081 0 13 43 21 17214.29 9.2857143 0.0008000000 0 7 X.LASTDATE 249 173 135015.39 7.8554913 0.0027381281 4 14 X.DOB 210 148 74405.76 5.9594595 0.0010739394 4 28 X.DOB 179 124 50452.09 4.6532258 0.0010739394 4 56 74 46 0.00 0.0000000 0.0008000000 0 57 X.LASTDATE 105 78 48868.68 7.3974359 0.0009034007 3 114 50 37 10010.81 3.2432432 0.0008000000 0 115 55 41 37643.12 11.1463415 0.0008000000 0 29 31 24 22648.96 12.7083333 0.0008000000 0 15 39 25 56927.84 19.0800000 0.0008000000 0 nsurrogate 1 0 2 0 4 0 5 0 3 0 6 0 12 0 13 0 7 0 14 0 28 0 56 0 57 0 114 0 115 0 29 0 15 0 > summary.rpart(tree,file="cty_tree_0008.out") > > yhat <- tree$frame[tree$where,5] > ehat <- y - yhat > mse.lrn <- sum(ehat[idx.lrn]^2)/n.lrn > mse.val <- sum(ehat[idx.val]^2)/n.val > mse.tst <- sum(ehat[idx.tst]^2)/n.tst > > print(paste(" mse.lrn = ",mse.lrn)) [1] " mse.lrn = 19.8099228572865" > print(paste(" mse.val = ",mse.val)) [1] " mse.val = 18.8311786562636" > print(paste(" mse.tst = ",mse.tst)) [1] " mse.tst = 18.3128051798462" > > outvec <- yhat[idx.lrn] > outlab <- "yhat.lrn" > outfil <- "yhat_lrn_0008.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > outvec <- yhat[idx.val] > outlab <- "yhat.val" > outfil <- "yhat_val_0008.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > outvec <- yhat[idx.tst] > outlab <- "yhat.tst" > outfil <- "yhat_tst_0008.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > rm(outvec,outlab,outfil) > > x0 <- c(0,n.lrn) > y0 <- c(0,sum(y.lrn-0.68)) > > x1 <- (1:n.lrn) > y1 <- yhat[1:n.lrn] > y1 <- y1-0.68 > y1 <- sort(y1) > y1 <- y1[n.lrn:1] > y1 <- cumsum(y1) > > idx <- 1:n.lrn > > print(paste("maximum profit in learning sample is ", max(y1))) [1] "maximum profit in learning sample is 7309.86" > print(paste("maximum occurs at ", idx[y1==max(y1)])) [1] "maximum occurs at 66854" > > idx <- seq(1,n.lrn,length=200) > > x1 <- x1[idx] > y1 <- y1[idx] > > source("psopts.r"); > postscript(file="cty_lif_0008.eps"); > > plot(x=c(x0,x1),y=c(y0,y1),ylab="dollars",xlab="size of mailing",type="n") > lines(x=x0,y=y0,col="green") > lines(x=x1,y=y1,col="red") > > dev.off() null device 1 > proc.time() [1] 1286.91 10.05 1299.04 0.00 0.00 >