Linear Regression I

Bikeshare Data

Author

Byeong-Hak Choe

Published

February 8, 2026

Modified

February 10, 2026

R packages

library(tidyverse)
library(broom)
library(stargazer)
library(skimr)
library(DT)

Reading a CSV File

bikeshare <- read_csv('https://bcdanl.github.io/data/bikeshare_cleaned.csv')


Variable description

Variable Description
cnt Count of total rental bikes
year Year
month Month
date Date
hr Hour
wkday Weekday
holiday Holiday indicator (1 if holiday, 0 otherwise)
seasons Season
weather_cond Weather condition
temp Temperature (measured in standard deviations from average)
hum Humidity (measured in standard deviations from average)
windspeed Wind speed (measured in standard deviations from average)

Continuous variables

  • cnt
  • temp
  • hum
  • windspeed

Categorical variables

  • year
  • month
  • date
  • hr
  • wkday
  • holiday
  • seasons
  • weather_cond

Descriptive Statistics

skim(bikeshare)
Data summary
Name bikeshare
Number of rows 17376
Number of columns 12
_______________________
Column type frequency:
character 5
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
month 0 1 2 2 0 12 0
date 0 1 2 2 0 31 0
wkday 0 1 6 9 0 7 0
seasons 0 1 4 6 0 4 0
weather_cond 0 1 14 24 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
cnt 0 1 189.48 181.40 1.00 40.00 142.00 281.00 977.00 ▇▃▁▁▁
year 0 1 2011.50 0.50 2011.00 2011.00 2012.00 2012.00 2012.00 ▇▁▁▁▇
hr 0 1 11.55 6.91 0.00 6.00 12.00 18.00 23.00 ▇▇▆▇▇
holiday 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
temp 0 1 0.00 1.00 -2.48 -0.82 0.02 0.85 2.61 ▂▇▇▇▁
hum 0 1 0.00 1.00 -3.25 -0.76 0.01 0.79 1.93 ▁▃▇▇▆
windspeed 0 1 0.00 1.00 -1.55 -0.70 0.03 0.52 5.40 ▇▆▂▁▁

Histograms

bikeshare |> 
  ggplot(aes(x = cnt)) +
  geom_histogram()

Categorical Variables

bikeshare |> 
  count(year) |> 
  datatable()
bikeshare |> 
  count(month) |> 
  datatable()
bikeshare |> 
  count(date) |> 
  datatable()
bikeshare |> 
  count(hr) |> 
  datatable()
bikeshare |> 
  count(wkday) |> 
  datatable()
bikeshare |> 
  count(holiday) |> 
  datatable()
bikeshare |> 
  count(seasons) |> 
  datatable()
bikeshare |> 
  count(weather_cond) |> 
  datatable()

Data Preparation

bikeshare <- bikeshare |> 
  mutate(year = factor(year),
         year = fct_relevel(year, "2011"),
         seasons = factor(seasons,
                          levels = 
                            c("spring", 
                              "summer", 
                              "fall",
                              "winter")),
         month = factor(month),
         month = fct_relevel(month, "01"),
         hr = factor(hr,
                     levels = 0:23),
         wkday = factor(wkday,
                          levels = 
                            c("sunday", "monday", "tuesday", "wednesday", 
                              "thursday", "friday", "saturday")),
         weather_cond = factor(weather_cond,
                          levels = 
                            c("Clear or Few Cloudy", 
                              "Light Snow or Light Rain",
                              "Mist or Cloudy")),
         )

Training and Test Data

set.seed(1)

bikeshare <- bikeshare |> 
  mutate(rnd = runif(n()))

dtrain <- bikeshare |> 
  filter(rnd > 0.4)

dtest <- bikeshare |> 
  filter(rnd <= 0.4)

nrow(dtrain) / nrow(bikeshare)
[1] 0.5965124



Linear Regression Model

\[ \begin{align} \text{cnt}_{i} =\ &\beta_{\text{intercept}}\\ &+ \beta_{\text{temp}} \, \text{temp}_{i} + \beta_{\text{hum}} \, \text{hum}_{i} + \beta_{\text{windspeed}} \, \text{windspeed}_{i} \nonumber \\ &+ \beta_{\text{year\_2012}} \, \text{year\_2012}_{i}\\ &+ \beta_{\text{month\_2}} \, \text{month\_2}_{i} + \beta_{\text{month\_3}} \, \text{month\_3}_{i} + \beta_{\text{month\_4}} \, \text{month\_4}_{i} \nonumber \\ &+ \beta_{\text{month\_5}} \, \text{month\_5}_{i} + \beta_{\text{month\_6}} \, \text{month\_6}_{i} + \beta_{\text{month\_7}} \, \text{month\_7}_{i} + \beta_{\text{month\_8}} \, \text{month\_8}_{i} \nonumber \\ &+ \beta_{\text{month\_9}} \, \text{month\_9}_{i} + \beta_{\text{month\_10}} \, \text{month\_10}_{i} + \beta_{\text{month\_11}} \, \text{month\_11}_{i} + \beta_{\text{month\_12}} \, \text{month\_12}_{i} \nonumber \\ &+ \beta_{\text{hr\_1}} \, \text{hr\_1}_{i} + \beta_{\text{hr\_2}} \, \text{hr\_2}_{i} + \beta_{\text{hr\_3}} \, \text{hr\_3}_{i} + \beta_{\text{hr\_4}} \, \text{hr\_4}_{i} \nonumber \\ &+ \beta_{\text{hr\_5}} \, \text{hr\_5}_{i} + \beta_{\text{hr\_6}} \, \text{hr\_6}_{i} + \beta_{\text{hr\_7}} \, \text{hr\_7}_{i} + \beta_{\text{hr\_8}} \, \text{hr\_8}_{i} \nonumber \\ &+ \beta_{\text{hr\_9}} \, \text{hr\_9}_{i} + \beta_{\text{hr\_10}} \, \text{hr\_10}_{i} + \beta_{\text{hr\_11}} \, \text{hr\_11}_{i} + \beta_{\text{hr\_12}} \, \text{hr\_12}_{i} \nonumber \\ &+ \beta_{\text{hr\_13}} \, \text{hr\_13}_{i} + \beta_{\text{hr\_14}} \, \text{hr\_14}_{i} + \beta_{\text{hr\_15}} \, \text{hr\_15}_{i} + \beta_{\text{hr\_16}} \, \text{hr\_16}_{i} \nonumber \\ &+ \beta_{\text{hr\_17}} \, \text{hr\_17}_{i} + \beta_{\text{hr\_18}} \, \text{hr\_18}_{i} + \beta_{\text{hr\_19}} \, \text{hr\_19}_{i} + \beta_{\text{hr\_20}} \, \text{hr\_20}_{i} \nonumber \\ &+ \beta_{\text{hr\_21}} \, \text{hr\_21}_{i} + \beta_{\text{hr\_22}} \, \text{hr\_22}_{i} + \beta_{\text{hr\_23}} \, \text{hr\_23}_{i} \nonumber \\ &+ \beta_{\text{wkday\_monday}} \, \text{wkday\_monday}_{i} + \beta_{\text{wkday\_tuesday}} \, \text{wkday\_tuesday}_{i} + \beta_{\text{wkday\_wednesday}} \, \text{wkday\_wednesday}_{i} \nonumber \\ &+ \beta_{\text{wkday\_thursday}} \, \text{wkday\_thursday}_{i} + \beta_{\text{wkday\_friday}} \, \text{wkday\_friday}_{i} + \beta_{\text{wkday\_saturday}} \, \text{wkday\_saturday}_{i} \nonumber \\ &+ \beta_{\text{holiday\_1}} \, \text{holiday\_1}_{i} \nonumber \\ &+ \beta_{\text{seasons\_summer}} \, \text{seasons\_summer}_{i} + \beta_{\text{seasons\_fall}} \, \text{seasons\_fall}_{i} + \beta_{\text{seasons\_winter}} \, \text{seasons\_winter}_{i} \nonumber \\ &+ \beta_{\text{weather\_cond\_Light\_Snow\_or\_Light\_Rain}} \, \text{weather\_cond\_Light\_Snow\_or\_Light\_Rain}_{i}\nonumber \\ &+ \beta_{\text{weather\_cond\_Mist\_or\_Cloudy}} \, \text{weather\_cond\_Mist\_or\_Cloudy}_{i}\\ &+ \epsilon_{i} \end{align} \]

Note that all predictors are dummy variables, except for temp, hum, and windspeed.

Training the Model

model <- lm(cnt ~ temp + hum + windspeed + 
                  year +
                  month + 
                  hr + 
                  wkday +
                  holiday +
                  seasons +
                  weather_cond,
            data = dtrain)

Regression Table with stargazer()

stargazer(model, type = "html")
Dependent variable:
cnt
temp 45.480***
(2.350)
hum -17.192***
(1.386)
windspeed -4.965***
(1.083)
year2012 86.236***
(2.026)
month02 4.601
(5.103)
month03 10.228*
(5.705)
month04 3.197
(8.457)
month05 17.549*
(9.049)
month06 3.907
(9.302)
month07 -12.534
(10.479)
month08 8.047
(10.143)
month09 31.316***
(9.047)
month10 13.305
(8.429)
month11 -13.280
(8.110)
month12 -11.074*
(6.421)
hr1 -15.375**
(6.960)
hr2 -24.699***
(6.939)
hr3 -40.640***
(7.015)
hr4 -41.506***
(7.036)
hr5 -24.365***
(6.952)
hr6 34.893***
(6.883)
hr7 166.134***
(6.915)
hr8 319.314***
(6.965)
hr9 162.039***
(6.868)
hr10 111.098***
(6.932)
hr11 137.942***
(6.941)
hr12 168.066***
(7.012)
hr13 164.771***
(7.317)
hr14 145.958***
(7.153)
hr15 165.300***
(7.188)
hr16 222.334***
(7.217)
hr17 378.852***
(7.058)
hr18 345.427***
(7.064)
hr19 233.315***
(6.954)
hr20 155.840***
(7.014)
hr21 108.184***
(6.980)
hr22 70.452***
(6.943)
hr23 27.440***
(6.927)
wkdaymonday 8.383**
(3.865)
wkdaytuesday 11.273***
(3.734)
wkdaywednesday 16.442***
(3.732)
wkdaythursday 12.750***
(3.757)
wkdayfriday 18.119***
(3.719)
wkdaysaturday 16.427***
(3.693)
holiday -31.995***
(6.466)
seasonssummer 38.039***
(6.275)
seasonsfall 25.825***
(7.428)
seasonswinter 73.019***
(6.360)
weather_condLight Snow or Light Rain -64.598***
(4.145)
weather_condMist or Cloudy -10.930***
(2.487)
Constant -19.330***
(7.497)
Observations 10,365
R2 0.692
Adjusted R2 0.691
Residual Std. Error 101.607 (df = 10314)
F Statistic 463.994*** (df = 50; 10314)
Note: p<0.1; p<0.05; p<0.01

Beta Estimates with tidy()

model_betas <- tidy(model, 
                    conf.int = T)  # conf.level = 0.95 (default)

model_betas_90ci <- tidy(model, 
                       conf.int = T,
                       conf.level = 0.90)

model_betas_99ci <- tidy(model, 
                       conf.int = T,
                       conf.level = 0.99)

rmarkdown::paged_table(model_betas)
# coef(model) returns a vector of beta estimates:
# coef(model)

Prediction with augment()

model_pred_train <- augment(model)

rmarkdown::paged_table(model_pred_train)
model_pred_test <- augment(model, newdata = dtest)

rmarkdown::paged_table(model_pred_test)

Various Model Statistics with glance()

model_r2 <- glance(model)

rmarkdown::paged_table(model_r2)

Coefficient Plots

model_betas |> 
  filter(str_detect(term, "hr")) |> 
  mutate(term = factor(term,
                       levels = str_c("hr", 1:23))
         )|> 
  ggplot(
    aes(xmin = conf.low, 
        xmax = conf.high,
        x = estimate,
        y = term)
  ) +
  geom_pointrange() +
  geom_point() +
  geom_vline(xintercept = 0, color = "darkorange", linetype = 2) +
  labs(y = "")

# Add a CI label to each and row-bind
month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "hr")) |>
  mutate(term = factor(term,
                       levels = str_c("hr", 1:23))
         )|> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Hour effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  filter(term %in% c("temp", "hum", "windspeed")) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Weather effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "month")) |>
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Month effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "wkday")) |> 
  mutate(term = str_replace_all(term, "wkday", ""),
         term = factor(term,
                          levels = 
                            c("monday", "tuesday", "wednesday", 
                              "thursday", "friday", "saturday"))) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Day effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "seasons")) |> 
  mutate(term = str_replace_all(term, "seasons", ""),
         term = factor(term,
                          levels = 
                            c("summer", "fall", "winter"))) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Season effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_betas_90ci |> mutate(ci = "90%"),
  model_betas      |> mutate(ci = "95%"),
  model_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "weather_cond")) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = estimate,
    xmin = conf.low,
    xmax = conf.high,
    color = ci
  )
) +
  geom_vline(xintercept = 0, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Weather condition effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

Residual Plot

model_pred_test |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot for Test Data"
  )

model_pred_train |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot for Training Data"
  )



Linear Regression Model with Log-Transformed Outcome

Histograms

bikeshare |> 
  ggplot(aes(x = cnt)) +
  geom_histogram()

bikeshare |> 
  ggplot(aes(x = log(cnt))) +
  geom_histogram()

\[ \begin{align} \log(\text{cnt}_{i}) =\ &\beta_{\text{intercept}}\\ &+ \beta_{\text{temp}} \, \text{temp}_{i} + \beta_{\text{hum}} \, \text{hum}_{i} + \beta_{\text{windspeed}} \, \text{windspeed}_{i} \nonumber \\ &+ \beta_{\text{year\_2012}} \, \text{year\_2012}_{i}\\ &+ \beta_{\text{month\_2}} \, \text{month\_2}_{i} + \beta_{\text{month\_3}} \, \text{month\_3}_{i} + \beta_{\text{month\_4}} \, \text{month\_4}_{i} \nonumber \\ &+ \beta_{\text{month\_5}} \, \text{month\_5}_{i} + \beta_{\text{month\_6}} \, \text{month\_6}_{i} + \beta_{\text{month\_7}} \, \text{month\_7}_{i} + \beta_{\text{month\_8}} \, \text{month\_8}_{i} \nonumber \\ &+ \beta_{\text{month\_9}} \, \text{month\_9}_{i} + \beta_{\text{month\_10}} \, \text{month\_10}_{i} + \beta_{\text{month\_11}} \, \text{month\_11}_{i} + \beta_{\text{month\_12}} \, \text{month\_12}_{i} \nonumber \\ &+ \beta_{\text{hr\_1}} \, \text{hr\_1}_{i} + \beta_{\text{hr\_2}} \, \text{hr\_2}_{i} + \beta_{\text{hr\_3}} \, \text{hr\_3}_{i} + \beta_{\text{hr\_4}} \, \text{hr\_4}_{i} \nonumber \\ &+ \beta_{\text{hr\_5}} \, \text{hr\_5}_{i} + \beta_{\text{hr\_6}} \, \text{hr\_6}_{i} + \beta_{\text{hr\_7}} \, \text{hr\_7}_{i} + \beta_{\text{hr\_8}} \, \text{hr\_8}_{i} \nonumber \\ &+ \beta_{\text{hr\_9}} \, \text{hr\_9}_{i} + \beta_{\text{hr\_10}} \, \text{hr\_10}_{i} + \beta_{\text{hr\_11}} \, \text{hr\_11}_{i} + \beta_{\text{hr\_12}} \, \text{hr\_12}_{i} \nonumber \\ &+ \beta_{\text{hr\_13}} \, \text{hr\_13}_{i} + \beta_{\text{hr\_14}} \, \text{hr\_14}_{i} + \beta_{\text{hr\_15}} \, \text{hr\_15}_{i} + \beta_{\text{hr\_16}} \, \text{hr\_16}_{i} \nonumber \\ &+ \beta_{\text{hr\_17}} \, \text{hr\_17}_{i} + \beta_{\text{hr\_18}} \, \text{hr\_18}_{i} + \beta_{\text{hr\_19}} \, \text{hr\_19}_{i} + \beta_{\text{hr\_20}} \, \text{hr\_20}_{i} \nonumber \\ &+ \beta_{\text{hr\_21}} \, \text{hr\_21}_{i} + \beta_{\text{hr\_22}} \, \text{hr\_22}_{i} + \beta_{\text{hr\_23}} \, \text{hr\_23}_{i} \nonumber \\ &+ \beta_{\text{wkday\_monday}} \, \text{wkday\_monday}_{i} + \beta_{\text{wkday\_tuesday}} \, \text{wkday\_tuesday}_{i} + \beta_{\text{wkday\_wednesday}} \, \text{wkday\_wednesday}_{i} \nonumber \\ &+ \beta_{\text{wkday\_thursday}} \, \text{wkday\_thursday}_{i} + \beta_{\text{wkday\_friday}} \, \text{wkday\_friday}_{i} + \beta_{\text{wkday\_saturday}} \, \text{wkday\_saturday}_{i} \nonumber \\ &+ \beta_{\text{holiday\_1}} \, \text{holiday\_1}_{i} \nonumber \\ &+ \beta_{\text{seasons\_summer}} \, \text{seasons\_summer}_{i} + \beta_{\text{seasons\_fall}} \, \text{seasons\_fall}_{i} + \beta_{\text{seasons\_winter}} \, \text{seasons\_winter}_{i} \nonumber \\ &+ \beta_{\text{weather\_cond\_Light\_Snow\_or\_Light\_Rain}} \, \text{weather\_cond\_Light\_Snow\_or\_Light\_Rain}_{i}\nonumber \\ &+ \beta_{\text{weather\_cond\_Mist\_or\_Cloudy}} \, \text{weather\_cond\_Mist\_or\_Cloudy}_{i}\\ &+ \epsilon_{i} \end{align} \]

Training the Model

model_log <- lm(log(cnt) ~ temp + hum + windspeed + 
                  year +
                  month + 
                  hr + 
                  wkday +
                  holiday +
                  seasons +
                  weather_cond,
            data = dtrain)

Regression Table with stargazer()

stargazer(model_log, type = "html")
Dependent variable:
log(cnt)
temp 0.277***
(0.014)
hum -0.057***
(0.009)
windspeed -0.037***
(0.007)
year2012 0.486***
(0.012)
month02 0.146***
(0.031)
month03 0.114***
(0.035)
month04 0.064
(0.052)
month05 0.208***
(0.056)
month06 0.078
(0.057)
month07 -0.093
(0.065)
month08 0.001
(0.062)
month09 0.070
(0.056)
month10 -0.018
(0.052)
month11 -0.097*
(0.050)
month12 -0.084**
(0.040)
hr1 -0.606***
(0.043)
hr2 -1.125***
(0.043)
hr3 -1.794***
(0.043)
hr4 -2.021***
(0.043)
hr5 -0.993***
(0.043)
hr6 0.241***
(0.042)
hr7 1.239***
(0.043)
hr8 1.916***
(0.043)
hr9 1.574***
(0.042)
hr10 1.252***
(0.043)
hr11 1.386***
(0.043)
hr12 1.524***
(0.043)
hr13 1.507***
(0.045)
hr14 1.413***
(0.044)
hr15 1.507***
(0.044)
hr16 1.730***
(0.044)
hr17 2.142***
(0.043)
hr18 2.042***
(0.043)
hr19 1.760***
(0.043)
hr20 1.480***
(0.043)
hr21 1.245***
(0.043)
hr22 0.984***
(0.043)
hr23 0.581***
(0.043)
wkdaymonday -0.037
(0.024)
wkdaytuesday -0.039*
(0.023)
wkdaywednesday -0.010
(0.023)
wkdaythursday 0.036
(0.023)
wkdayfriday 0.122***
(0.023)
wkdaysaturday 0.109***
(0.023)
holiday -0.156***
(0.040)
seasonssummer 0.339***
(0.039)
seasonsfall 0.406***
(0.046)
seasonswinter 0.673***
(0.039)
weather_condLight Snow or Light Rain -0.603***
(0.026)
weather_condMist or Cloudy -0.054***
(0.015)
Constant 3.145***
(0.046)
Observations 10,365
R2 0.824
Adjusted R2 0.823
Residual Std. Error 0.626 (df = 10314)
F Statistic 967.908*** (df = 50; 10314)
Note: p<0.1; p<0.05; p<0.01

Beta Estimates with tidy()

model_log_betas <- tidy(model_log, 
                        conf.int = T)  # conf.level = 0.95 (default)

model_log_betas_90ci <- tidy(model_log, 
                             conf.int = T,
                             conf.level = 0.90)

model_log_betas_99ci <- tidy(model_log, 
                             conf.int = T,
                             conf.level = 0.99)
# coef(model_log) returns a vector of beta estimates:
# coef(model_log)

Prediction with augment()

model_log_pred_train <- augment(model_log)
model_log_pred_test <- augment(model_log, newdata = dtest)

Various Model Statistics with glance()

model_log_r2 <- glance(model_log)

Coefficient Plots

model_log_betas |> 
  filter(str_detect(term, "hr")) |> 
  mutate(term = factor(term,
                       levels = str_c("hr", 1:23))
         )|> 
  ggplot(
    aes(xmin = exp(conf.low), 
        xmax = exp(conf.high),
        x = exp(estimate),
        y = term)
  ) +
  geom_pointrange() +
  geom_point() +
  geom_vline(xintercept = 1, color = "darkorange", linetype = 2) +
  labs(y = "")

month_ci <- bind_rows(
  model_log_betas_90ci |> mutate(ci = "90%"),
  model_log_betas      |> mutate(ci = "95%"),
  model_log_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "hr")) |>
  mutate(term = factor(term,
                       levels = str_c("hr", 1:23))
         )|> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = exp(estimate),
    xmin = exp(conf.low),
    xmax = exp(conf.high),
    color = ci
  )
) +
  geom_vline(xintercept = 1, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Hour effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_log_betas_90ci |> mutate(ci = "90%"),
  model_log_betas      |> mutate(ci = "95%"),
  model_log_betas_99ci |> mutate(ci = "99%")
) |>
  filter(term %in% c("temp", "hum", "windspeed")) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = exp(estimate),
    xmin = exp(conf.low),
    xmax = exp(conf.high),
    color = ci
  )
) +
  geom_vline(xintercept = 1, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Weather effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_log_betas_90ci |> mutate(ci = "90%"),
  model_log_betas      |> mutate(ci = "95%"),
  model_log_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "month")) |>
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = exp(estimate),
    xmin = exp(conf.low),
    xmax = exp(conf.high),
    color = ci
  )
) +
  geom_vline(xintercept = 1, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Month effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_log_betas_90ci |> mutate(ci = "90%"),
  model_log_betas      |> mutate(ci = "95%"),
  model_log_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "wkday")) |> 
  mutate(term = str_replace_all(term, "wkday", ""),
         term = factor(term,
                          levels = 
                            c("monday", "tuesday", "wednesday", 
                              "thursday", "friday", "saturday"))) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = exp(estimate),
    xmin = exp(conf.low),
    xmax = exp(conf.high),
    color = ci
  )
) +
  geom_vline(xintercept = 1, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Day effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_log_betas_90ci |> mutate(ci = "90%"),
  model_log_betas      |> mutate(ci = "95%"),
  model_log_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "seasons")) |> 
  mutate(term = str_replace_all(term, "seasons", ""),
         term = factor(term,
                          levels = 
                            c("summer", "fall", "winter"))) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = exp(estimate),
    xmin = exp(conf.low),
    xmax = exp(conf.high),
    color = ci
  )
) +
  geom_vline(xintercept = 1, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Season effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

month_ci <- bind_rows(
  model_log_betas_90ci |> mutate(ci = "90%"),
  model_log_betas      |> mutate(ci = "95%"),
  model_log_betas_99ci |> mutate(ci = "99%")
) |>
  filter(str_detect(term, "weather_cond")) |> 
  mutate(ci = factor(ci, levels = c("90%", "95%", "99%")))


ggplot(
  month_ci,
  aes(
    y = term,
    x = exp(estimate),
    xmin = exp(conf.low),
    xmax = exp(conf.high),
    color = ci
  )
) +
  geom_vline(xintercept = 1, color = "maroon", linetype = 2) +
  geom_pointrange(
    position = position_dodge(width = 0.6)
  ) +
  labs(
    title = "Weather condition effects with multiple confidence intervals",
    color = "CI level",
    y = ""
  ) 

Residual Plot

model_log_pred_test |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot with Test Data"
  )

model_log_pred_train |> 
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    x = "Fitted values",
    y = "Residuals",
    title = "Residual Plot with Training Data"
  )

Back to top