# Load the customer-segmentation data set from GitHub.
# - The URL must not carry trailing whitespace, otherwise the request can fail.
# - stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0.0 (where
#   the default became FALSE); code further down looks for columns of class
#   "factor".
sim.dat <- read.csv(
  "https://raw.githubusercontent.com/happyrabbit/DataScientistR/master/Data/SegData.csv",
  stringsAsFactors = TRUE
)
summary(sim.dat)

# set problematic values as missings
# (ages above 100 and negative in-store expenses are treated as data errors)
sim.dat$age[which(sim.dat$age > 100)] <- NA
sim.dat$store_exp[which(sim.dat$store_exp < 0)] <- NA
# see the results: both cleaned columns are shown, together with income
summary(subset(sim.dat, select = c("age", "store_exp", "income")))

# impute() function in imputeMissings package
# save the result as another object
# Median/mode imputation with imputeMissings. The call is namespace-qualified
# because several packages export a function called impute().
demo_imp <- imputeMissings::impute(sim.dat, method = "median/mode")
# check the first 5 columns, there is no missing values in other columns
summary(demo_imp[, 1:5])

# preProcess() function in caret package
# Median imputation: preProcess() only builds the imputation model ...
imp <- caret::preProcess(sim.dat, method = "medianImpute")
# ... and predict() applies it to the data.
demo_imp2 <- predict(imp, sim.dat)
summary(demo_imp2[, 1:5])

# preProcess() function in caret package
# K-nearest-neighbour imputation model with k = 5 neighbours.
imp <- caret::preProcess(sim.dat, method = "knnImpute", k = 5)
# need to use predict() to get KNN result
# NOTE(review): the accompanying text indicates this call fails because
# sim.dat has non-numeric variables. The error is caught and reported so that
# sourcing the script does not stop here; demo_imp is NULL in that case.
demo_imp <- tryCatch(
  predict(imp, sim.dat),
  error = function(e) {
    message("knnImpute on the full data set failed: ", conditionMessage(e))
    NULL
  }
)

# sim.dat has non-numeric variables
# find the non-numeric (factor / character) columns
imp <- caret::preProcess(sim.dat, method = "knnImpute", k = 5)
idx <- which(!vapply(sim.dat, is.numeric, logical(1)))
# Select the columns to keep positively: sim.dat[, -idx] would return zero
# columns when idx is empty.
num_cols <- setdiff(seq_along(sim.dat), idx)
demo_imp <- predict(imp, sim.dat[, num_cols, drop = FALSE])
summary(demo_imp[, 1:3])

# Bagged-tree imputation: build the model, then apply it with predict().
imp <- caret::preProcess(sim.dat, method = "bagImpute")
demo_imp <- predict(imp, sim.dat)
summary(demo_imp[, 1:5])

# Income vector used for the manual centering / scaling that follows.
income <- sim.dat$income
# Manual standardisation of income (missing values are ignored when the
# mean and standard deviation are estimated).
# calculate the mean of income
mux <- mean(income, na.rm = TRUE)
# calculate the standard deviation of income
sdx <- sd(income, na.rm = TRUE)
# centering
tr1 <- income - mux
# scaling
tr2 <- tr1 / sdx

# preProcess()
# The same centering and scaling done with caret on the age and income columns.
sdat <- subset(sim.dat, select = c("age", "income"))
# set the "method" option
trans <- caret::preProcess(sdat, method = c("center", "scale"))
# use predict() function to get the final result
transformed <- predict(trans, sdat)

# NOTE(review): describe() is not a base R function; presumably
# psych::describe() - confirm which package is intended and load it.
describe(sim.dat)

# select the two columns and save them as dat_bc
# Box-Cox transformation of the two transaction-count columns.
dat_bc <- subset(sim.dat, select = c("store_trans", "online_trans"))
# The outer parentheses print the fitted preProcess object when the
# statement is run interactively.
(trans <- caret::preProcess(dat_bc, method = c("BoxCox")))
# Use predict() to get the transformed result:
transformed <- predict(trans, dat_bc)

# Z-score:
# \[Z_{i}=\frac{Y_{i}-\bar{Y}}{s}\]
# where \(\bar{Y}\) and \(s\) are mean and standard deviation for \(Y\)
# Modified Z-score:
# \[M_{i}=\frac{0.6745(Y_{i}-\tilde{Y})}{MAD}\]
# where \(\tilde{Y}\) is the median of \(Y\) and MAD is the median of the series \(|Y_{i} - \tilde{Y}|\), called the median absolute deviation
# corrplot()

# class.ind() from nnet package
# Builds an indicator matrix from gender.
dumVar <- nnet::class.ind(sim.dat$gender)
head(dumVar)

# dummyVars() from caret
# Builds a dummy-variable encoder from the formula; predict() then produces
# the encoded matrix.
dumMod <- caret::dummyVars(
  ~ gender + house + income,
  data = sim.dat,
  # use "original variable name + level" as new name
  levelsOnly = FALSE
)
head(predict(dumMod, sim.dat))