Wage
Ordering boxplots
By marital status
# Read the Wage data; "?" marks missing values, strings become factors.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
# Reorder the marital-status factor levels by median wage so the
# boxplots are drawn in ascending order of median wage.
new_order <- with(Wage, reorder(maritl, wage, median, na.rm = TRUE))
svg("ordered-maritl.svg", width = 11, pointsize = 12, family = "sans")
# plot(factor, numeric) dispatches to boxplot.
plot(new_order, Wage$wage)
dev.off()
Finding variable of interest
x must be a numeric vector
Need to exclude region from the Wage data.frame because it causes tvals = lm(wage ~ ., data = Wage)
to throw the "contrasts can be applied only to factors with 2 or more levels" error.
library(leaps)

# Read the data and drop `region`: with fewer than 2 observed levels it
# makes lm(wage ~ ., data = Wage) fail with "contrasts can be applied
# only to factors with 2 or more levels".
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
Wage$region <- NULL

# Best-subset selection over all remaining predictors of wage.
regfit.full <- regsubsets(wage ~ ., data = Wage)

# Full linear model; summary() reports the coefficient t-values.
tvals <- lm(wage ~ ., data = Wage)
summary(tvals)
Scatterplots
Because age and wage are both quantitative, R’s basic plot function defaults to a scatterplot.
Subsetting
Subsets of qualitative data, education in this example, can be given different colours using the points function.
# Scatterplot of wage against age, with points coloured by education
# level. Avoids attach() (masking/footgun) and the five copy-pasted
# subset() calls by splitting on the education factor instead.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order
# ("1. < HS Grad" ... "5. Advanced Degree").
ed <- split(Wage, Wage$education)

# Fill colours for levels 1-5; palette colour 6 is skipped as in the
# original (colours 2, 3, 4, 5, 7).
ed_cols <- c(2, 3, 4, 5, 7)

svg("scatterplot-wage.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Scatterplot of Education vs Wage",
  cex = 1,
  pch = 21
)
# Overplot each education subset in its own fill colour
# (pch = 21 takes its fill from `bg`).
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
dev.off()
Simple linear regression lm(y ~ x)
.
# Per-education-level simple linear regressions of wage on age, drawn
# as abline()s over the coloured scatterplot. Avoids attach() and the
# five copy-pasted subset()/lm() calls.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order.
ed <- split(Wage, Wage$education)
ed_cols <- c(2, 3, 4, 5, 7)

# lm(wage ~ age, data = d) rather than lm(d$wage ~ d$age): same fit,
# but the formula stays clean and predict(fit, newdata) works.
fits <- lapply(ed, function(d) lm(wage ~ age, data = d))

svg("ablines-multiple.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Linear prediction by education",
  cex = 1,
  pch = 21
)
# All points first, then all regression lines, so the lines stay on top.
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
for (i in seq_along(fits)) {
  abline(fits[[i]], col = ed_cols[i], lwd = 2)
}
dev.off()
Quadratic (degree-2 polynomial) regression lm(y ~ x + I(x^2))
.
# Per-education-level quadratic (degree-2 polynomial) fits of wage on
# age. Avoids attach(), the five copy-pasted subset()/lm()/predict()
# calls, and the sort(..., index.return = TRUE)$ix idiom (order() is
# the direct way to get a sorting permutation).
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order.
ed <- split(Wage, Wage$education)
ed_cols <- c(2, 3, 4, 5, 7)

fits <- lapply(ed, function(d) lm(wage ~ age + I(age^2), data = d))

svg("binomial-multiple.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Binomial prediction by education",
  cex = 1,
  pch = 21
)
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
# Draw each fitted curve with ages in increasing order; without the
# reordering, lines() zig-zags across the plot.
for (i in seq_along(fits)) {
  ord <- order(ed[[i]]$age)
  lines(ed[[i]]$age[ord], fitted(fits[[i]])[ord], col = ed_cols[i], lwd = 2)
}
dev.off()
Cubic (degree-3 polynomial) regression lm(y ~ x + I(x^2) + I(x^3))
.
# Per-education-level cubic (degree-3 polynomial) fits of wage on age.
# Avoids attach(), the five copy-pasted subset()/lm()/predict() calls,
# and uses order() instead of sort(..., index.return = TRUE)$ix.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)

# One data frame per education level, in factor-level order.
ed <- split(Wage, Wage$education)
ed_cols <- c(2, 3, 4, 5, 7)

fits <- lapply(ed, function(d) lm(wage ~ age + I(age^2) + I(age^3), data = d))

svg("polynomial-multiple.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "Polynomial prediction by education",
  cex = 1,
  pch = 21
)
for (i in seq_along(ed)) {
  points(ed[[i]]$age, ed[[i]]$wage, bg = ed_cols[i], cex = 1, pch = 21)
}
# Draw each fitted curve with ages in increasing order; without the
# reordering, lines() zig-zags across the plot.
for (i in seq_along(fits)) {
  ord <- order(ed[[i]]$age)
  lines(ed[[i]]$age[ord], fitted(fits[[i]])[ord], col = ed_cols[i], lwd = 2)
}
dev.off()
Because education is qualitative (or a factor in R jargon), plot defaults to a boxplot. If education had been a number, it would have drawn a scatterplot unless education = as.factor(education)
was called.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
svg("boxplot-wage.svg", width = 11, pointsize = 12, family = "sans")
# education is a factor, so plot() dispatches to a boxplot of wage per
# education level; one fill colour per level.
plot(Wage$education, Wage$wage,
  xlab = "Education", ylab = "Wage", main = "Boxplot of Education vs Wage",
  col = c(2, 3, 4, 5, 6)
)
dev.off()
R’s lm (linear model) function
lm(wage ~ age)
returns a = 81.7047 and b = 0.7073 which can be used with abline to provide a simple linear regression prediction.
Wage <- read.csv("Wage.csv", header = TRUE, na.strings = "?", stringsAsFactors = TRUE)
# Simple linear regression of wage on age over the whole data set.
# Using `data =` avoids the attach() footgun of the original.
fit <- lm(wage ~ age, data = Wage)
svg("abline-all.svg", width = 11, pointsize = 12, family = "sans")
plot(Wage$age, Wage$wage,
  xlab = "Age", ylab = "Wage", main = "a = 81.7047 and b = 0.7073",
  # bg recycles these five fill colours across points in plotting order.
  bg = c(2, 3, 4, 5, 6),
  cex = 1,
  pch = 21
)
# Draw the fitted line (intercept a, slope b) on top of the points.
abline(fit, col = "red", lwd = 2)
dev.off()
Simply calling plot(lm.fit)
draws four graphs: Residuals vs Fitted, Q-Q Residuals, Scale-Location, Residuals vs Leverage.
https://r-graph-gallery.com/44-polynomial-curve-fitting.html