# In this assignment you will train a Naïve Bayes classifier on categorical
# data and predict individuals' incomes. Import the nbtrain.csv file. Use the
# first 9010 records as training data and the remaining 1000 records as
# testing data.

# Section 5.5.1 The Groceries Dataset

# Attach arules first: the Groceries transactions and the S4 classes used
# below (@itemInfo, @data) are provided by that package, so data(Groceries)
# cannot find the data set without it.
library(arules)

# Load the data set, then print it, its summary, and its class.
data(Groceries)
Groceries
summary(Groceries)
class(Groceries)

# display the first 20 grocery labels
Groceries@itemInfo[1:20, ]

# display the 10th to 20th transactions
# Columns 10:20 of the @data slot are selected; for each column the matching
# item labels are looked up and collapsed into one comma-separated string.
apply(
  Groceries@data[, 10:20], 2,
  function(r) paste(Groceries@itemInfo[r, "labels"], collapse = ", ")
)

# Section 5.5.2 Frequent Itemset Generation

# Mine frequent itemsets of a fixed size (1 through 4) at a minimum support
# of 0.02, then once more without an upper size limit. Each run overwrites
# `itemsets`, so only the last result remains afterwards.
# Expects `Groceries` to be loaded and arules to be attached.

# frequent 1-itemsets
itemsets <- apriori(
  Groceries,
  parameter = list(
    minlen = 1, maxlen = 1, support = 0.02, target = "frequent itemsets"
  )
)
summary(itemsets)
inspect(head(sort(itemsets, by = "support"), 10))

# frequent 2-itemsets
itemsets <- apriori(
  Groceries,
  parameter = list(
    minlen = 2, maxlen = 2, support = 0.02, target = "frequent itemsets"
  )
)
summary(itemsets)
inspect(head(sort(itemsets, by = "support"), 10))

# frequent 3-itemsets
itemsets <- apriori(
  Groceries,
  parameter = list(
    minlen = 3, maxlen = 3, support = 0.02, target = "frequent itemsets"
  )
)
inspect(sort(itemsets, by = "support"))

# frequent 4-itemsets
itemsets <- apriori(
  Groceries,
  parameter = list(
    minlen = 4, maxlen = 4, support = 0.02, target = "frequent itemsets"
  )
)
inspect(sort(itemsets, by = "support"))

# run Apriori without setting the maxlen parameter
itemsets <- apriori(
  Groceries,
  parameter = list(minlen = 1, support = 0.02, target = "frequent itemsets")
)

# Section 5.5.3 Rule Generation and Visualization

# Generate association rules from `Groceries` and visualize them.
# Expects `Groceries` to be loaded and arules to be attached.
# arulesViz is attached here because it supplies the plot() methods for
# rules objects used below.
library(arulesViz)

# mine rules with minimum support 0.001 and minimum confidence 0.6
rules <- apriori(
  Groceries,
  parameter = list(support = 0.001, confidence = 0.6, target = "rules")
)
summary(rules)

# plot of the rules, followed by a plot of their quality measures
plot(rules)
plot(rules@quality)

# displays rules with top lift scores
inspect(head(sort(rules, by = "lift"), 10))

# keep only the rules whose confidence exceeds 0.9
confidentRules <- rules[quality(rules)$confidence > 0.9]
confidentRules

plot(
  confidentRules,
  method = "matrix",
  measure = c("lift", "confidence"),
  control = list(reorder = TRUE)
)

# select the 5 rules with the highest lift
highLiftRules <- head(sort(rules, by = "lift"), 5)

plot(highLiftRules, method = "graph", control = list(type = "items"))

###
# This code covers the code presented in
# Section 8.2 ARIMA Model
###

###
# Section 8.2.5 Building and Evaluating an ARIMA Model
###

# Load the gasoline production series and inspect it for stationarity.

# install forecast only if it is not already available, then attach it
if (!requireNamespace("forecast", quietly = TRUE)) {
  install.packages("forecast")
}
library(forecast)

# read in gasoline production time series
# monthly gas production expressed in millions of barrels
# (read.csv() already returns a data frame, so no extra conversion is needed)
gas_prod_input <- read.csv("c:/data/gas_prod.csv")

# create a time series object from the second column of the input
gas_prod <- ts(gas_prod_input[, 2])

# examine the time series
plot(
  gas_prod,
  xlab = "Time (months)",
  ylab = "Gasoline production (millions of barrels)"
)

# check for conditions of a stationary time series
# (first differences, with a horizontal reference line at zero)
plot(diff(gas_prod))
abline(a = 0, b = 0)

# examine ACF and PACF of differenced series
acf(diff(gas_prod), xaxp = c(0, 48, 4), lag.max = 48, main = "")
pacf(diff(gas_prod), xaxp = c(0, 48, 4), lag.max = 48, main = "")

# Fit two candidate seasonal ARIMA models to `gas_prod` (created earlier in
# this script) and inspect the autocorrelation left in their residuals.

# fit a (0,1,0)x(1,0,0)12 ARIMA model
arima_1 <- arima(
  gas_prod,
  order = c(0, 1, 0),
  seasonal = list(order = c(1, 0, 0), period = 12)
)
arima_1

# it may be necessary to calculate AICc and BIC
# http://stats.stackexchange.com/questions/76761/extract-bic-and-aicc-from-arima-object
# AIC() with penalty k = log(series length) is used here as the BIC
AIC(arima_1, k = log(length(gas_prod))) # BIC

# examine ACF and PACF of the (0,1,0)x(1,0,0)12 residuals
acf(arima_1$residuals, xaxp = c(0, 48, 4), lag.max = 48, main = "")
pacf(arima_1$residuals, xaxp = c(0, 48, 4), lag.max = 48, main = "")

# fit a (0,1,1)x(1,0,0)12 ARIMA model
arima_2 <- arima(
  gas_prod,
  order = c(0, 1, 1),
  seasonal = list(order = c(1, 0, 0), period = 12)
)
arima_2

# it may be necessary to calculate AICc and BIC
# http://stats.stackexchange.com/questions/76761/extract-bic-and-aicc-from-arima-object
AIC(arima_2, k = log(length(gas_prod))) # BIC

# examine ACF and PACF of the (0,1,1)x(1,0,0)12 residuals
acf(arima_2$residuals, xaxp = c(0, 48, 4), lag.max = 48, main = "")
pacf(arima_2$residuals, xaxp = c(0, 48, 4), lag.max = 48, main = "")

# Normality and Constant Variance
# Residual diagnostics for `arima_2` (fitted earlier in this script).

# residuals over time, with a horizontal reference line at zero
plot(arima_2$residuals, ylab = "Residuals")
abline(a = 0, b = 0)

# distribution of the residuals
hist(arima_2$residuals, xlab = "Residuals", xlim = c(-20, 20))

# normal Q-Q plot of the residuals with its reference line
qqnorm(arima_2$residuals, main = "")
qqline(arima_2$residuals)

# Forecasting
# Uses `arima_2` and `gas_prod` created earlier in this script.

# predict the next 12 months
arima_2.predict <- predict(arima_2, n.ahead = 12)

# interval bounds: point forecast -/+ 1.96 standard errors, computed once
# and reused for both the table and the plot
forecast_lower <- arima_2.predict$pred - 1.96 * arima_2.predict$se
forecast_upper <- arima_2.predict$pred + 1.96 * arima_2.predict$se

# table of lower bound, prediction, and upper bound, one row per month
# NOTE(review): row labels 241:252 presumably assume a 240-observation
# series -- confirm against the input file
matrix(
  c(forecast_lower, arima_2.predict$pred, forecast_upper),
  12, 3,
  dimnames = list(c(241:252), c("LB", "Pred", "UB"))
)

# plot the tail of the observed series, then overlay the forecast (solid)
# and the interval bounds (dashed, colour 4)
plot(
  gas_prod,
  xlim = c(145, 252),
  xlab = "Time (months)",
  ylab = "Gasoline production (millions of barrels)",
  ylim = c(360, 440)
)
lines(arima_2.predict$pred)
lines(forecast_upper, col = 4, lty = 2)
lines(forecast_lower, col = 4, lty = 2)

