# Load the earnings data: a semicolon-separated file with a header row.
# The argument is spelled out as `header`; the abbreviated `head` only worked
# through partial argument matching.
p <- read.table("pgss_earnings.csv", header = TRUE, sep = ";")
# Keep only the rows with no missing value in any column, so every model
# fitted from `p` draws on the same complete-case data.
p <- p[complete.cases(p), ]
summary(p)

# Baseline model: log of should.earn regressed on schooling, a quadratic in
# age, and gender, estimated only on rows where fulltime == 1.
mod1 <- lm(
  log(should.earn) ~ schooling + age + I(age^2) + gender,
  data = p,
  subset = fulltime == 1
)
summary(mod1)
# Components of the summary object can be pulled out by name.
summary(mod1)[["coefficients"]]
round(summary(mod1)[["coefficients"]], 3)
summary(mod1)[["r.squared"]]
# Refitting a model with update()
# Same specification as mod1, estimated separately on the rows with
# year < 2004 and on the rows with year >= 2004 (fulltime == 1 in both).
mod1.bef <- update(mod1, subset = (fulltime == 1 & year < 2004))
summary(mod1.bef)
mod1.aft <- update(mod1, subset = (fulltime == 1 & year >= 2004))
summary(mod1.aft)

# Add year to the right-hand side of mod1, then compare the two fits.
mod2 <- update(mod1, . ~ . + year)
summary(mod2)
summary(mod2)[["r.squared"]]
anova(mod1, mod2)

# Add the log of do.earn as a further regressor and compare against mod2.
mod3 <- update(mod2, . ~ . + log(do.earn))
summary(mod3)
anova(mod2, mod3)
# Interactions
# Add the gender-by-year interaction term to the specification of mod3.
mod4 <- update(mod3, . ~ . + gender:year)
summary(mod4)
# Alternatively: var1*var2 is the same as var1 + var2 + var1:var2, so remove
# the two main effects and add them back together with their interaction.
mod4a <- update(mod3, . ~ . - year - gender + gender * year)
summary(mod4a)
# effect() comes from the effects package (which must be installed). The
# namespace-qualified call works whether or not library(effects) has been
# run earlier in the session.
plot(effects::effect("gender:year", mod4))
## In-class exercise
# 1. Estimate the regression model that has:
# - the log of reported actual earnings as the dependent variable
# - gender, schooling, age, age-squared, and year as the independent variables.
# Include in the analysis only the subjects that work full time.
# 2. The variable size is a factor that has 5 levels --- check them out using the function levels(). Transform the variable into a variable called city500 that is equal to one for subjects living in cities with population at least 500,000, and 0 otherwise.
# 3. Update your regression model by adding the variable city500 into it. Has the fit improved? How do you interpret the result?
# 4. Check for the possibility that the variables gender and schooling interact. Is the interaction effect significant? Plot the interaction effect using the function effect() in the package effects.