Wage
Ordering boxplots
By marital status
# Read the Wage data; "?" marks missing values, strings become factors.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
# Reorder the marital-status factor levels by median wage so the
# boxplots are drawn in ascending order of median wage.
new_order <- with(Wage, reorder(maritl, wage, median, na.rm = TRUE))
svg("ordered-maritl.svg", width = 11, pointsize = 12, family = "sans")
# plot(factor, numeric) dispatches to boxplot.
plot(new_order, Wage$wage)
dev.off()
Finding variable of interest
x must be a numeric vector
Need to exclude region from the Wage data.frame because it causes tvals = lm(wage ~ ., data = Wage)
to throw the "contrasts can be applied only to factors with 2 or more levels" error.
library(leaps)

# Read the data and drop `region`: with fewer than 2 observed levels it
# makes lm(wage ~ ., data = Wage) fail with "contrasts can be applied
# only to factors with 2 or more levels".
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
Wage$region <- NULL

# Best-subset selection over all remaining predictors of wage.
regfit.full <- regsubsets(wage ~ ., data = Wage)

# Full linear model; summary() reports the coefficient t-values.
tvals <- lm(wage ~ ., data = Wage)
summary(tvals)
Scatterplots
Because age and wage are both quantitative, R’s basic plot function defaults to a scatterplot.
Subsetting
Subsets of qualitative data, education in this example, can be given different colours using the points function.
# Scatterplot of wage against age, with points coloured by education
# level. Avoids attach() (masking/footgun) and the five copy-pasted
# subset() calls by splitting on the education factor instead.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order
# ("1. < HS Grad" ... "5. Advanced Degree").
ed <- split(Wage, Wage$education)

# Fill colours for levels 1-5; palette colour 6 is skipped as in the
# original (colours 2, 3, 4, 5, 7).
ed_cols <- c(2, 3, 4, 5, 7)

svg("scatterplot-wage.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Scatterplot of Education vs Wage",
  cex = 1,
  pch = 21
)
# Overplot each education subset in its own fill colour
# (pch = 21 takes its fill from `bg`).
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
dev.off()
Simple linear regression lm(y ~ x)
.
# Per-education-level simple linear regressions of wage on age, drawn
# as abline()s over the coloured scatterplot. Avoids attach() and the
# five copy-pasted subset()/lm() calls.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order.
ed <- split(Wage, Wage$education)
ed_cols <- c(2, 3, 4, 5, 7)

# lm(wage ~ age, data = d) rather than lm(d$wage ~ d$age): same fit,
# but the formula stays clean and predict(fit, newdata) works.
fits <- lapply(ed, function(d) lm(wage ~ age, data = d))

svg("ablines-multiple.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Linear prediction by education",
  cex = 1,
  pch = 21
)
# All points first, then all regression lines, so the lines stay on top.
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
for (i in seq_along(fits)) {
  abline(fits[[i]], col = ed_cols[i], lwd = 2)
}
dev.off()
Quadratic (degree-2 polynomial) regression lm(y ~ x + I(x^2))
.
# Per-education-level quadratic (degree-2 polynomial) fits of wage on
# age. Avoids attach(), the five copy-pasted subset()/lm()/predict()
# calls, and the sort(..., index.return = TRUE)$ix idiom (order() is
# the direct way to get a sorting permutation).
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order.
ed <- split(Wage, Wage$education)
ed_cols <- c(2, 3, 4, 5, 7)

fits <- lapply(ed, function(d) lm(wage ~ age + I(age^2), data = d))

svg("binomial-multiple.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Binomial prediction by education",
  cex = 1,
  pch = 21
)
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
# Draw each fitted curve with ages in increasing order; without the
# reordering, lines() zig-zags across the plot.
for (i in seq_along(fits)) {
  ord <- order(ed[[i]]$age)
  lines(ed[[i]]$age[ord], fitted(fits[[i]])[ord], col = ed_cols[i], lwd = 2)
}
dev.off()
Cubic (degree-3 polynomial) regression lm(y ~ x + I(x^2) + I(x^3))
.
# Per-education-level cubic (degree-3 polynomial) fits of wage on age.
# Avoids attach(), the five copy-pasted subset()/lm()/predict() calls,
# and uses order() instead of sort(..., index.return = TRUE)$ix.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order.
ed <- split(Wage, Wage$education)
ed_cols <- c(2, 3, 4, 5, 7)

fits <- lapply(ed, function(d) lm(wage ~ age + I(age^2) + I(age^3), data = d))

svg("polynomial-multiple.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Polynomial prediction by education",
  cex = 1,
  pch = 21
)
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
# Draw each fitted curve with ages in increasing order; without the
# reordering, lines() zig-zags across the plot.
for (i in seq_along(fits)) {
  ord <- order(ed[[i]]$age)
  lines(ed[[i]]$age[ord], fitted(fits[[i]])[ord], col = ed_cols[i], lwd = 2)
}
dev.off()
Because education is qualitative (or a factor in R jargon), plot defaults to a boxplot. If education had been a number, it would have drawn a scatterplot unless education = as.factor(education)
was called.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
svg("boxplot-wage.svg", width = 11, pointsize = 12, family = "sans")
# education is a factor, so plot() dispatches to a boxplot of wage per
# education level; one fill colour per level.
plot(Wage$education, Wage$wage,
  xlab = "Education", ylab = "Wage", main = "Boxplot of Education vs Wage",
  col = c(2, 3, 4, 5, 6)
)
dev.off()
R’s lm (linear model) function
lm(wage ~ age)
returns a = 81.7047 and b = 0.7073 which can be used with abline to provide a simple linear regression prediction.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
# Simple linear regression of wage on age over the whole data set.
# Using `data =` avoids the attach() footgun of the original.
fit <- lm(wage ~ age, data = Wage)
svg("abline-all.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "a = 81.7047 and b = 0.7073",
  # bg recycles these five fill colours across points in plotting order.
  bg = c(2, 3, 4, 5, 6),
  cex = 1,
  pch = 21
)
# Draw the fitted line (intercept a, slope b) on top of the points.
abline(fit, col = "red", lwd = 2)
dev.off()
Simply calling plot(lm.fit)
draws four graphs: Residuals vs Fitted, Q-Q Residuals, Scale-Location, Residuals vs Leverage.
https://r-graph-gallery.com/44-polynomial-curve-fitting.html