R : Copyright 2002, The R Development Core Team Version 1.5.1 (2002-06-17) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type `license()' or `licence()' for distribution details. R is a collaborative project with many contributors. Type `contributors()' for more information. Type `demo()' for some demos, `help()' for on-line help, or `help.start()' for a HTML browser interface to help. Type `q()' to quit R. > invisible(options(echo = TRUE)) > library(rpart) > > target.lrn <- read.table("../lrn/num/472.dat",header=T,colClasses="numeric") > target.val <- read.table("../val/num/472.dat",header=T,colClasses="numeric") > target.tst <- read.table("../tst/num/472.dat",header=T,colClasses="numeric") > > > y.lrn <- target.lrn[,1] > y.val <- target.val[,1] > y.tst <- target.tst[,1] > y <- c(y.lrn,y.val,y.tst) > > n.lrn <- length(y.lrn) > n.val <- length(y.val) > n.tst <- length(y.tst) > n <- length(y) > > rm(target.lrn,target.val,target.tst) > > wts <- mat.or.vec(n,1) ; for (i in 1:n.lrn) wts[i]=1 > idx.lrn <- 1:n.lrn > idx.val <- (n.lrn+1):(n.lrn+n.val) > idx.tst <- (n.lrn+n.val+1):n > > mod <- read.table("../cty_mod.txt", + header=F,colClasses="character",col.names=c("file","feature","type")) > > n.mod <- length(mod$file) > > first.time <- TRUE > > for (i in 1:n.mod) { + + fn.lrn <- paste("../lrn/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.val <- paste("../val/",mod$type[i],"/",mod$file[i],".dat",sep="") + fn.tst <- paste("../tst/",mod$type[i],"/",mod$file[i],".dat",sep="") + print(mod$feature[i]) + + if (mod$type[i]=="chr") { + + f.lrn <- read.table(fn.lrn, + header=T,colClasses="character",blank.lines.skip=F) + f.val <- read.table(fn.val, + header=T,colClasses="character",blank.lines.skip=F) + f.tst <- read.table(fn.tst, + header=T,colClasses="character",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + if (mod$feature[i]=="STATE") { + f[f=="AS"|f=="DC"|f=="DE"|f=="MA"|f=="ME"|f=="NH"] <- "S1" + f[f=="OH"|f=="RI"|f=="VI"|f=="WV"] <- "S1" + f[f=="AA"|f=="AE"|f=="AP"|f=="CT"|f=="GU"|f=="MD"] <- "S2" + f[f=="NJ"|f=="NY"|f=="PA"|f=="PA"|f=="VA"|f=="VT"] <- "S2" + f[f=="WY"] <- "S2" + f[f=="AK"|f=="UT"|f=="MS"] <- "S3" + f[f=="NE"|f=="ND"] <- "S4" + f[f=="SD"|f=="SC"] <- "S5" + f[f=="AL"|f=="HI"|f=="WI"|f=="NC"|f=="OK"|f=="GA"] <- "00" + f[f=="MN"|f=="S2"|f=="TN"|f=="ID"|f=="IN"|f=="IL"] <- "00" + f[f=="M0"|f=="AR"] <- "00" + } + + f <- as.factor(f) + + n.lev <- nlevels(f) + f.name <- levels(f) + if(n.lev==2) f.name <- c(mod$feature[i],mod$feature[i]) + print(paste(" nlevels = ",n.lev)) + + f <- model.matrix(y ~ f - 1) # Note: Intercept removed. + f <- f[,2:n.lev] # Note: First dummy deleted. + f.name <- f.name[2:n.lev] # Note: First name deleted. + + } else { + + f.lrn<-read.table(fn.lrn, + header=T,colClasses="numeric",blank.lines.skip=F) + f.val<-read.table(fn.val, + header=T,colClasses="numeric",blank.lines.skip=F) + f.tst<-read.table(fn.tst, + header=T,colClasses="numeric",blank.lines.skip=F) + + f <- c(f.lrn[,1],f.val[,1],f.tst[,1]) + + f[is.na(f)] <- 0 + + f.name <- mod$feature[i] + + if (mod$feature[i]=="DOB") { + d <- f ; d[d>0] <- 1 # Note: Dummy for missing DOB + f <- cbind(d,f,f^2) # Note: Quadratic term added to DOB. + rm(d) + f.name <- c("DOB_miss","DOB_linr","DOB_quad") + } + + } + + if (first.time) { + X <- f + X.names <- f.name + first.time <- FALSE + } else { + X <- cbind(prev.X,f) + X.names <- c(prev.X.names,f.name) + } + + prev.X <- X + prev.X.names <- X.names + + } [1] "LASTGIFT" [1] "PEPSTRFL" [1] " nlevels = 2" [1] "STATE" [1] " nlevels = 21" [1] "RECP3" [1] " nlevels = 2" [1] "DOB" [1] "MAILCODE" [1] " nlevels = 2" [1] "MHUC2" [1] "LASTDATE" [1] "MINRAMNT" > > rm(prev.X,prev.X.names) > rm(f.lrn,f.val,f) > > dimnames(X) <- list(NULL,X.names) > > fit <- lm(y~X,weights=wts) # Note: Validation sample not in fit. > aov <- anova.lm(fit) > > ehat <- fit$residuals > mse.lrn <- sum(ehat[idx.lrn]^2)/n.lrn > mse.val <- sum(ehat[idx.val]^2)/n.val > mse.tst <- sum(ehat[idx.tst]^2)/n.tst > > print(paste(" rank = ",fit$rank)) [1] " rank = 31" > print(paste(" P-value = ",aov[1,5])) [1] " P-value = 8.32540332683308e-75" > print(paste(" mse.lrn = ",mse.lrn)) [1] " mse.lrn = 19.9668158999099" > print(paste(" mse.val = ",mse.val)) [1] " mse.val = 18.6684473002424" > print(paste(" mse.tst = ",mse.tst)) [1] " mse.tst = 17.8144203024274" > > print(summary.lm(fit)) Call: lm(formula = y ~ X, weights = wts) Residuals: Min 1Q Median 3Q Max -22.0056 -0.8254 -0.5873 0.0000 199.2404 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.904e+01 3.400e+00 -5.600 2.15e-08 *** XLASTGIFT 2.128e-02 1.415e-03 15.044 < 2e-16 *** XPEPSTRFL 1.964e-01 3.746e-02 5.242 1.59e-07 *** XAZ 5.679e-02 1.134e-01 0.501 0.616408 XCA 3.303e-01 5.090e-02 6.491 8.60e-11 *** XCO 1.861e-01 1.236e-01 1.505 0.132310 XFL 7.588e-02 6.609e-02 1.148 0.250873 XIA -1.082e-01 1.506e-01 -0.718 0.472713 XKS -4.439e-03 1.521e-01 -0.029 0.976725 XKY -1.319e-02 1.369e-01 -0.096 0.923241 XLA -4.175e-02 1.370e-01 -0.305 0.760579 XMI 2.779e-02 7.768e-02 0.358 0.720579 XMO 5.463e-02 1.064e-01 0.513 0.607743 XMT 8.673e-02 2.363e-01 0.367 0.713578 XNM 1.956e-01 1.839e-01 1.064 0.287454 XNV -1.013e-02 1.773e-01 -0.057 0.954434 XOR 3.230e-01 1.167e-01 2.767 0.005653 ** XS1 -6.318e-01 4.748e-01 -1.331 0.183290 XS3 -1.868e-01 1.293e-01 -1.445 0.148473 XS4 3.847e-02 1.708e-01 0.225 0.821819 XS5 1.810e-01 1.206e-01 1.501 0.133320 XTX 4.126e-02 6.901e-02 0.598 0.549965 XWA 1.852e-01 9.489e-02 1.952 0.050918 . XRECP3 5.769e-01 1.226e-01 4.705 2.54e-06 *** XDOB_miss -2.511e-01 9.644e-02 -2.604 0.009227 ** XDOB_linr 1.985e-04 5.206e-05 3.812 0.000138 *** XDOB_quad -2.632e-08 6.792e-09 -3.876 0.000106 *** XMAILCODE -4.313e-01 1.452e-01 -2.970 0.002984 ** XMHUC2 5.208e-02 2.028e-02 2.567 0.010248 * XLASTDATE 2.005e-03 3.560e-04 5.632 1.79e-08 *** XMINRAMNT -4.378e-03 2.413e-03 -1.814 0.069688 . --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 4.469 on 66869 degrees of freedom Multiple R-Squared: 0.006587, Adjusted R-squared: 0.006142 F-statistic: 14.78 on 30 and 66869 DF, p-value: < 2.2e-16 > > outvec <- fit$fitted.values[idx.lrn] > outlab <- "yhat.lrn" > outfil <- "yhat_lrn_prun.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > outvec <- fit$fitted.values[idx.val] > outlab <- "yhat.val" > outfil <- "yhat_val_prun.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > outvec <- fit$fitted.values[idx.tst] > outlab <- "yhat.tst" > outfil <- "yhat_tst_prun.dat" > write(c(outlab,formatC(outvec,digits=17,width=26,format="e")),file=outfil) > > rm(outvec,outlab,outfil) > > x0 <- c(0,n.lrn) > y0 <- c(0,sum(y.lrn-0.68)) > > x1 <- (1:n.lrn) > y1 <- fit$fitted.values[1:n.lrn] > y1 <- y1-0.68 > y1 <- sort(y1) > y1 <- y1[n.lrn:1] > y1 <- cumsum(y1) > > idx <- 1:n.lrn > > print(paste("maximum profit in learning sample is ", max(y1))) [1] "maximum profit in learning sample is 11360.3348474610" > print(paste("maximum occurs at ", idx[y1==max(y1)])) [1] "maximum occurs at 40559" > > idx <- seq(1,n.lrn,length=200) > > x1 <- x1[idx] > y1 <- y1[idx] > > source("psopts.r"); > postscript(file="cty_prun_lif.eps"); > > plot(x=c(x0,x1),y=c(y0,y1),ylab="dollars",xlab="size of mailing",type="n") > lines(x=x0,y=y0,col="green") > lines(x=x1,y=y1,col="red") > > dev.off() null device 1 > proc.time() [1] 94.74 7.36 102.55 0.00 0.00 >