R : Copyright 2002, The R Development Core Team
Version 1.5.1  (2002-06-17)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type `license()' or `licence()' for distribution details.

R is a collaborative project with many contributors.
Type `contributors()' for more information.

Type `demo()' for some demos, `help()' for on-line help, or
`help.start()' for a HTML browser interface to help.
Type `q()' to quit R.

> invisible(options(echo = TRUE))
> library(rpart)
> 
> target.lrn <- read.table("../lrn/num/472.dat",header=T,colClasses="numeric")
> target.val <- read.table("../val/num/472.dat",header=T,colClasses="numeric")
> target.tst <- read.table("../tst/num/472.dat",header=T,colClasses="numeric")
> 
>  
> y.lrn <- target.lrn[,1]
> y.val <- target.val[,1]
> y.tst <- target.tst[,1]
> y     <- c(y.lrn,y.val,y.tst)
> 
> n.lrn <- length(y.lrn)
> n.val <- length(y.val)
> n.tst <- length(y.tst)
> n     <- length(y)
> 
> rm(target.lrn,target.val,target.tst)
> 
> wts <- mat.or.vec(n,1) ; for (i in 1:n.lrn) wts[i]=1
> idx.lrn <- 1:n.lrn
> idx.val <- (n.lrn+1):(n.lrn+n.val)
> idx.tst <- (n.lrn+n.val+1):n
> 
> mod <- read.table("../cty_mod.txt",
+          header=F,colClasses="character",col.names=c("file","feature","type"))
> 
> n.mod <- length(mod$file)
> 
> first.time <- TRUE
> 
> for (i in 1:n.mod) {
+ 
+   fn.lrn <- paste("../lrn/",mod$type[i],"/",mod$file[i],".dat",sep="")
+   fn.val <- paste("../val/",mod$type[i],"/",mod$file[i],".dat",sep="")
+   fn.tst <- paste("../tst/",mod$type[i],"/",mod$file[i],".dat",sep="")
+   print(mod$feature[i])
+ 
+   if (mod$type[i]=="chr") {
+ 
+     f.lrn <- read.table(fn.lrn,
+                header=T,colClasses="character",blank.lines.skip=F)
+     f.val <- read.table(fn.val,
+                header=T,colClasses="character",blank.lines.skip=F)
+     f.tst <- read.table(fn.tst,
+                header=T,colClasses="character",blank.lines.skip=F)
+ 
+     f <- c(f.lrn[,1],f.val[,1],f.tst[,1])
+ 
+     if (mod$feature[i]=="STATE") {
+       f[f=="AS"|f=="DC"|f=="DE"|f=="MA"|f=="ME"|f=="NH"] <- "S1"
+       f[f=="OH"|f=="RI"|f=="VI"|f=="WV"]                 <- "S1"
+       f[f=="AA"|f=="AE"|f=="AP"|f=="CT"|f=="GU"|f=="MD"] <- "S2"
+       f[f=="NJ"|f=="NY"|f=="PA"|f=="PA"|f=="VA"|f=="VT"] <- "S2"
+       f[f=="WY"]                                         <- "S2"  
+       f[f=="AK"|f=="UT"|f=="MS"]                         <- "S3"
+       f[f=="NE"|f=="ND"]                                 <- "S4"
+       f[f=="SD"|f=="SC"]                                 <- "S5"
+     } 
+     
+     f <- as.factor(f)
+ 
+     n.lev <- nlevels(f)
+     f.name <- levels(f) 
+     if(n.lev==2) f.name <- c(mod$feature[i],mod$feature[i])
+     print(paste("  nlevels = ",n.lev))
+ 
+     f <- model.matrix(y ~ f - 1)      # Note: Intercept removed.
+     f <- f[,2:n.lev]                  # Note: First dummy deleted.
+     f.name <- f.name[2:n.lev]         # Note: First name deleted.
+ 
+   } else {
+     
+     f.lrn<-read.table(fn.lrn,
+              header=T,colClasses="numeric",blank.lines.skip=F)
+     f.val<-read.table(fn.val,
+              header=T,colClasses="numeric",blank.lines.skip=F)
+     f.tst<-read.table(fn.tst,
+              header=T,colClasses="numeric",blank.lines.skip=F)
+ 
+     f <- c(f.lrn[,1],f.val[,1],f.tst[,1])
+ 
+     f[is.na(f)] <- 0
+ 
+     f.name <- mod$feature[i]
+ 
+   }
+ 
+   if (first.time) {
+     X <- f
+     X.names <- f.name
+     first.time <- FALSE
+   } else {
+     X <- cbind(prev.X,f)
+     X.names <- c(prev.X.names,f.name)
+   }
+ 
+   prev.X <- X
+   prev.X.names <- X.names
+ 
+ }
[1] "LASTGIFT"
[1] "PEPSTRFL"
[1] "  nlevels =  2"
[1] "STATE"
[1] "  nlevels =  33"
[1] "RECP3"
[1] "  nlevels =  2"
[1] "DOB"
[1] "MAILCODE"
[1] "  nlevels =  2"
[1] "MHUC2"
[1] "LASTDATE"
[1] "MINRAMNT"
> 
> rm(prev.X,prev.X.names)
> rm(f.lrn,f.val,f)
> 
> dimnames(X) <- list(NULL,X.names)
> 
> ctrl <- rpart.control(minsplit=90,maxsurrogate=0,maxdepth=15,cp=0.0008) 
> tree <- rpart(y~X,weights=wts,method="anova",control=ctrl) 
> 
> source("psopts.r")
> postscript(file="cty_tree_0008.eps")
> par.default <- par(no.readonly=TRUE)
> par(mar=c(0,0,0,0))
> plot(tree)
> text(tree) 
> par(par.default)
> dev.off()
null device 
          1 
>   
> print(ctrl)
$minsplit
[1] 90

$minbucket
[1] 30

$cp
[1] 8e-04

$maxcompete
[1] 4

$maxsurrogate
[1] 0

$usesurrogate
[1] 2

$surrogatestyle
[1] 0

$maxdepth
[1] 15

$xval
[1] 10

> print(tree$frame)
           var     n    wt        dev       yval   complexity ncompete
1   X.LASTGIFT 95412 66900 1344637.77  0.7887979 0.0034462960        4
2   X.LASTGIFT 94735 66431 1163123.67  0.7666839 0.0010147590        4
4       <leaf> 87823 61590  861271.89  0.7265038 0.0006310128        0
5       <leaf>  6912  4841  300487.30  1.2778785 0.0006849709        0
3   X.PEPSTRFL   677   469  176880.08  3.9211087 0.0031555503        4
6      X.MHUC2   428   296   37621.62  1.6216216 0.0009874031        4
12      <leaf>   385   275   19079.64  1.0363636 0.0004344081        0
13      <leaf>    43    21   17214.29  9.2857143 0.0008000000        0
7   X.LASTDATE   249   173  135015.39  7.8554913 0.0027381281        4
14       X.DOB   210   148   74405.76  5.9594595 0.0010739394        4
28       X.DOB   179   124   50452.09  4.6532258 0.0010739394        4
56      <leaf>    74    46       0.00  0.0000000 0.0008000000        0
57  X.LASTDATE   105    78   48868.68  7.3974359 0.0009034007        3
114     <leaf>    50    37   10010.81  3.2432432 0.0008000000        0
115     <leaf>    55    41   37643.12 11.1463415 0.0008000000        0
29      <leaf>    31    24   22648.96 12.7083333 0.0008000000        0
15      <leaf>    39    25   56927.84 19.0800000 0.0008000000        0
    nsurrogate
1            0
2            0
4            0
5            0
3            0
6            0
12           0
13           0
7            0
14           0
28           0
56           0
57           0
114          0
115          0
29           0
15           0
> summary.rpart(tree,file="cty_tree_0008.out")
> 
> yhat    <- tree$frame[tree$where,5]
> ehat    <- y - yhat
> mse.lrn <-  sum(ehat[idx.lrn]^2)/n.lrn 
> mse.val <-  sum(ehat[idx.val]^2)/n.val
> mse.tst <-  sum(ehat[idx.tst]^2)/n.tst
> 
> print(paste("  mse.lrn = ",mse.lrn))
[1] "  mse.lrn =  19.8099228572865"
> print(paste("  mse.val = ",mse.val))
[1] "  mse.val =  18.8311786562636"
> print(paste("  mse.tst = ",mse.tst))
[1] "  mse.tst =  18.3128051798462"
> 
> outvec <- yhat[idx.lrn]
> outlab <- "yhat.lrn"
> outfil <- "yhat_lrn_0008.dat"
> write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil)
> 
> outvec <- yhat[idx.val]
> outlab <- "yhat.val"
> outfil <- "yhat_val_0008.dat"
> write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil)
> 
> outvec <- yhat[idx.tst]
> outlab <- "yhat.tst"
> outfil <- "yhat_tst_0008.dat"
> write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil)
> 
> rm(outvec,outlab,outfil)
> 
> x0 <- c(0,n.lrn)
> y0 <- c(0,sum(y.lrn-0.68))
> 
> x1 <- (1:n.lrn)
> y1 <- yhat[1:n.lrn]
> y1 <- y1-0.68
> y1 <- sort(y1)
> y1 <- y1[n.lrn:1]
> y1 <- cumsum(y1)
> 
> idx <- 1:n.lrn
> 
> print(paste("maximum profit in learning sample is ", max(y1)))
[1] "maximum profit in learning sample is  7309.86"
> print(paste("maximum occurs at ", idx[y1==max(y1)]))
[1] "maximum occurs at  66854"
> 
> idx <- seq(1,n.lrn,length=200)
> 
> x1 <- x1[idx]
> y1 <- y1[idx]
> 
> source("psopts.r");
> postscript(file="cty_lif_0008.eps");
> 
> plot(x=c(x0,x1),y=c(y0,y1),ylab="dollars",xlab="size of mailing",type="n")
> lines(x=x0,y=y0,col="green")
> lines(x=x1,y=y1,col="red")
> 
> dev.off()
null device 
          1 
> proc.time()
[1] 1286.91   10.05 1299.04    0.00    0.00
>