Chapter 8

```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE) ``` ```{r, echo=F, message=F, warning=F} library(readr) library(openintro) library(here) library(tidyverse) library(xtable) library(broom) library(faraway) data(COL) ``` # Line fitting, residuls, and correlation ## Modeling numerical variables In this unit, we will learn to quantify the relationship between two numerical variables, as well as modeling numerical response variables using a numerical or categorical explanatory variable. ## Powerty vs. HS graduate rate The **scatterplot** below shows the relationship between HS graduate rate in all 50 US states and DC and the % of residents who live below the poverty line (income below \$23,050 for a family of 4 in 2012). \begin{columns} \begin{column}{0.5\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} poverty = read.table("dataset/poverty.txt", header = T, sep = "\t") par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 2, cex.axis = 2) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2]) ``` \end{column} \begin{column}{0.45\textwidth} \alert{Response variable?} \pause \% in poverty \pause \alert{Explanatory variable?} \pause \% HS grad \pause \alert{Relationship?} \pause Linear, negative, moderately strong \end{column} \end{columns} ## The linear model for predicting poverty from high school graduation rate in the US is \centering{$\hat{\text{poverty}} = 64.78-0.62 \times HS_{grad}$} \raggedright The "hat" is used to signify that this is an estimate. ## \alert{The high school graduate rate in Georgia is 85.1\%. What poverty level does the model predict for this state?} \pause \centering{$64.78-0.62 \times 85.1 = 12.018$} ## Eyeballing the line \begin{columns} \begin{column}{0.35\textwidth} \alert{Which of the following appears to be the line that best fits the linear relationship between \% in poverty and \% HS grad? Choose one} \end{column} \begin{column}{0.6\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 1.5, cex.axis = 1.5) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2]) abline(lm(poverty$Poverty ~ poverty$Graduates), col = COL[4], lwd = 3, lty = 1) y1 = lm(poverty$Poverty ~ poverty$Graduates)$coefficients[1] + 2 + (1.1 * lm(poverty$Poverty ~ poverty$Graduates)$coefficients[2]) * poverty$Graduates abline(lm(y1 ~ poverty$Graduates), lwd = 3, col = COL[2], lty = 2) abline(h = 14, lwd = 3, col = COL[5], lty = 3) y2 = 114 - (12/10) * seq(70,100,1) abline(lm(y2 ~ seq(70,100,1)), lwd = 3, col = COL[3], lty = 4) legend("topright", inset = 0.05, c("(a)","(b)","(c)", "(d)"), col = c(COL[4],COL[2],COL[5],COL[3]), lwd = 3, lty = c(1,2,3,4)) ``` \end{column} \end{columns} ## Eyeballing the line \begin{columns} \begin{column}{0.35\textwidth} \alert{Which of the following appears to be the line that best fits the linear relationship between \% in poverty and \% HS grad? Choose one} \textcolor{blue}{Answer:} \textcolor{blue}{(a) Solid Red Line} \end{column} \begin{column}{0.6\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 1.5, cex.axis = 1.5) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2]) abline(lm(poverty$Poverty ~ poverty$Graduates), col = COL[4], lwd = 3, lty = 1) y1 = lm(poverty$Poverty ~ poverty$Graduates)$coefficients[1] + 2 + (1.1 * lm(poverty$Poverty ~ poverty$Graduates)$coefficients[2]) * poverty$Graduates abline(lm(y1 ~ poverty$Graduates), lwd = 3, col = COL[2], lty = 2) abline(h = 14, lwd = 3, col = COL[5], lty = 3) y2 = 114 - (12/10) * seq(70,100,1) abline(lm(y2 ~ seq(70,100,1)), lwd = 3, col = COL[3], lty = 4) legend("topright", inset = 0.05, c("(a)","(b)","(c)", "(d)"), col = c(COL[4],COL[2],COL[5],COL[3]), lwd = 3, lty = c(1,2,3,4)) ``` \end{column} \end{columns} ## Residuals **Residuals** are the leftovers from the model fit: Data = Fit + Residual ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 1.5, cex.axis = 1.5) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2], cex.lab = 2, cex.axis = 2) lm_pov_grad = lm(poverty$Poverty ~ poverty$Graduates) pred = predict(lm_pov_grad) x = poverty$Graduates for(i in 1:length(pred)){ lines(c(x[i],x[i]), c(poverty$Poverty[i],pred[i]), col = COL[2]) } abline(lm_pov_grad, col = COL[4], lwd = 3) ``` ## Residuals Residual is the difference between the observed ($y_i$) and predicted ($\hat{y_i}$) \centering{$e_i = y_i - \hat{y_i}$} \begin{columns} \begin{column}{0.5\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 1.5, cex.axis = 1.5) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2], cex.axis = 2, cex.lab = 2) lm_pov_grad = lm(poverty$Poverty ~ poverty$Graduates) pred = predict(lm_pov_grad) x = poverty$Graduates for(i in c(9,40)){ lines(c(x[i],x[i]), c(poverty$Poverty[i],pred[i]), col = COL[2]) text(x[i]+0.5, poverty$Poverty[i], "y", cex = 1.5, col = "blue") text(x[i]+1.2, mean(c(poverty$Poverty[i],pred[i])), as.character(round(poverty$Poverty[i] - pred[i],2)), cex = 3, col = "orange") text(x[i]-0.5, pred[i], expression(hat(y)), cex = 2, col = COL[4]) } text(x[9], poverty$Poverty[9] + 0.5, "DC", col = COL[2],cex = 1.8) text(x[40], poverty$Poverty[40] - 0.5, "RI", col = COL[2],cex = 1.8) abline(lm_pov_grad, col = COL[4], lwd = 3) ``` \end{column} \begin{column}{0.45\textwidth} \pause \begin{itemize} \item \% living in poverty in DC is 5.44\% more than predicted. \pause \item \% living in poverty in RI is 4.16\% less than predicted. \end{itemize} \end{column} \end{columns} ## Quantifying the relationship - **Correlation** describes the strength of the \underline{linear} association between two variables. \pause - It takes values between -1 (perfect negative) and +1 (perfect positive). \pause - A value of 0 indicates no linear association. ## Guessting the correlation \alert{Which of the following is the best guess for the correlation between \% in poverty and \% HS grad?} \begin{columns} \begin{column}{0.3\textwidth} A) 0.6 B) -0.75 C) -0.1 D) 0.02 C) -1.5 \end{column} \begin{column}{0.65\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 2, cex.axis = 2) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2]) lm_pov_grad = lm(poverty$Poverty ~ poverty$Graduates) abline(lm_pov_grad, col = COL[4], lwd = 3) ``` \end{column} \end{columns} ## Guessting the correlation \alert{Which of the following is the best guess for the correlation between \% in poverty and \% HS grad?} \begin{columns} \begin{column}{0.3\textwidth} A) 0.6 B) \textcolor{red}{-0.75} C) -0.1 D) 0.02 C) -1.5 \end{column} \begin{column}{0.65\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 2, cex.axis = 2) plot(poverty$Poverty ~ poverty$Graduates, ylab = "% in poverty", xlab = "% HS grad", pch=19, col=COL[1,2]) lm_pov_grad = lm(poverty$Poverty ~ poverty$Graduates) abline(lm_pov_grad, col = COL[4], lwd = 3) ``` \end{column} \end{columns} ## Guessting the correlation \alert{Which of the following is the best guess for the correlation between \% in poverty and \% female householder, no husband present?} \begin{columns} \begin{column}{0.3\textwidth} A) 0.1 B) -0.6 C) -0.4 D) 0.9 C) 0.5 \end{column} \begin{column}{0.65\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 2, cex.axis = 2) plot(poverty$Poverty ~ poverty$PercentFemaleHouseholderNoHusbandPresent, ylab = "% in poverty", xlab = "% female householder, no husband present", pch=19, col=COL[1,2]) ``` \end{column} \end{columns} ## Guessting the correlation \alert{Which of the following is the best guess for the correlation between \% in poverty and \% female householder, no husband present?} \begin{columns} \begin{column}{0.3\textwidth} A) 0.1 B) -0.6 C) -0.4 D) 0.9 C) \textcolor{red}{0.5} \end{column} \begin{column}{0.65\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} par(mar=c(4,4,1,1), las=1, mgp=c(2.5,0.7,0), cex.lab = 2, cex.axis = 2) plot(poverty$Poverty ~ poverty$PercentFemaleHouseholderNoHusbandPresent, ylab = "% in poverty", xlab = "% female householder, no husband present", pch=19, col=COL[1,2]) ``` \end{column} \end{columns} ## Assessing the correlation \alert{Which of the following has the strongest correlation, i.e. correlation coeeficient closest to +1 or -1?} \begin{columns} \begin{column}{0.65\textwidth} ```{r, echo=F, message=F, warning=F, out.width="100%",fig.align='center'} set.seed(179) # 4 scatterplots x = seq(0,10,0.1) yNonLin = (x-3)^2 - 4 + rnorm(length(x), mean = 0, sd = 1) yPosLinStrong = 3*x + 10 + rnorm(length(x), mean = 0, sd = 2) yPosLinWeak = 3*x + 10 + rnorm(length(x), mean = 0, sd = 20) yNegLinWeak = -3*x + 10 + rnorm(length(x), mean = 0, sd = 5) par(mar=c(2,1,1,1), las=1, mgp=c(0.5,0.7,0), mfrow = c(2,2), cex.lab = 2, cex.axis = 2) plot(yNonLin ~ x, ylab = "", xlab = "(a)", pch=19, col=COL[1,2], axes = FALSE) box() plot(yPosLinStrong ~ x, ylab = "", xlab = "(b)", pch=19, col=COL[1,2], axes = FALSE) box() plot(yPosLinWeak ~ x, ylab = "", xlab = "(c)", pch=19, col=COL[1,2], axes = FALSE) box() plot(yNegLinWeak ~ x, ylab = "", xlab = "(d)", pch=19, col=COL[1,2], axes = FALSE) box() ``` \end{column} \begin{column}{0.3\textwidth} \pause \alert{(b) \rightarrow correlation means \underline{linear} association} \end{column} \end{columns}