Code in Chapter 01: Data, Models and Software
Example of a Conceptual Model (adapted from Fig. 15.1 in Greed, 2000)
Example of a Conceptual Model (adapted from Fig. 1.2 in Ortúzar and Willumsen, 2011)
# Function `print()` displays the argument on the screen. 
# `print()` is a generic method in `R` which means that many types
# of objects can be printed, including a string as in the example
# below:
print("Hello, Discrete Choice Analysis!")## [1] "Hello, Discrete Choice Analysis!"# Function `rm()` removes objects from the _environment_
# that is objects currently in memory. The argument list = ls()
# removes all objects currently in memory
rm(list = ls())library(discrtr) # A companion package for the book Introduction to Discrete Choice Analysis with `R`
library(dplyr) # A Grammar of Data Manipulation
library(ggplot2) # Create Elegant Data Visualisations Using the Grammar of Graphics
library(mlogit) # Multinomial Logit Models 
library(readr) # Read Rectangular Text Data 
library(stargazer) # Well-Formatted Regression and Summary Statistics Tables# Read a csv file data and name the object
mc_mode_choice <- read_csv(system.file("extdata", 
                                       "mc_commute.csv", 
                                       package = "discrtr"),
                           show_col_types = FALSE)# `head()` displays the first few rows of a data object
# Indexing of the object in this example is used to display
# only columns 1 through 4
head(mc_mode_choice[,1:4]) ## # A tibble: 6 × 4
##   RespondentID choice avcycle avwalk
##          <dbl>  <dbl>   <dbl>  <dbl>
## 1    566872636      3       0      1
## 2    566873140      3       0      1
## 3    566874266      3       0      0
## 4    566874842      2       1      1
## 5    566881170      2       1      1
## 6    566907438      2       0      1# `stargazer()` takes as an input a data frame
stargazer(as.data.frame(mc_mode_choice[,1:5]), 
          # change the type to text, html, or latex depending on the desired output
          type = "latex", 
          header = FALSE, # do not print package version info in the output
          title = "Example of a table with summary statistics", # Title of table
          omit.summary.stat = c("N", 
                                "median"), # summary statistics to omit from output
          font.size = "small") # font size can be changed# Indexing allows us to choose parts of a data object
# In this example, we are extracting the first row of
# column `choice` in table `mc_mode_choice` and then
# The fourth row of the same column
mc_mode_choice$choice[1] - mc_mode_choice$choice[4]## [1] 1# Function `factor()` is used to convert a variable (which could be character or numeric)
# into a factor, that is, a label or category; when we want a factor to be ordered (treated
# as an ordinal variable) we specify argument ordered = TRUE. Non-ordinal variables by default
# are displayed alphabetically, but changing their order when  specifying the labels changes
# the order they are displayed _without necessarily making them ordinal_
mc_mode_choice$choice <- factor(mc_mode_choice$choice, 
                                labels = c("Cycle", 
                                           "Walk", 
                                           "HSR", 
                                           "Car"))summary(mc_mode_choice$choice)## Cycle  Walk   HSR   Car 
##    48   711   336   281mc_mode_choice$choice[1] - mc_mode_choice$choice[4]## Warning in Ops.factor(mc_mode_choice$choice[1], mc_mode_choice$choice[4]): '-'
## not meaningful for factors## [1] NAsummary(mc_mode_choice$timecycle)##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.29      3.79      5.83  34014.86 100000.00 100000.00# Find the class of an object
class(mc_mode_choice$choice)## [1] "factor"class(mc_mode_choice$timecycle)## [1] "numeric"mc_mode_choice[2, 2]## # A tibble: 1 × 1
##   choice
##   <fct> 
## 1 HSRmc_mode_choice$choice[2]## [1] HSR
## Levels: Cycle Walk HSR Carmc_mode_choice[["choice"]][2]## [1] HSR
## Levels: Cycle Walk HSR Carmc_mode_choice[2:5, 7:8]## # A tibble: 4 × 2
##   timecycle timewalk
##       <dbl>    <dbl>
## 1      3.73     12.8
## 2 100000    100000  
## 3      5.83     20  
## 4      5.83     20time.Cycle.clean <- mc_mode_choice$timecycle[mc_mode_choice$timecycle != 100000]class(time.Cycle.clean)## [1] "numeric"summary(time.Cycle.clean)##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2914  2.9141  4.3711  4.8957  5.8282 45.0000time.Active.clean <- mc_mode_choice %>% # Pipe data frame `mc_mode_choice`
  select(c("timecycle", # Select columns from the data frame that was piped
           "timewalk")) %>% 
  filter(timecycle != 100000 & timewalk != 100000) # Filter observations that are _not_ 100000time.Active.clean.the.hard.way <- mc_mode_choice[mc_mode_choice$timecycle != 100000 & 
                                                   mc_mode_choice$timewalk != 100000, 7:8]summary(time.Active.clean)##    timecycle          timewalk    
##  Min.   : 0.2914   Min.   : 1.00  
##  1st Qu.: 2.9141   1st Qu.:10.00  
##  Median : 4.3711   Median :15.00  
##  Mean   : 4.5852   Mean   :16.10  
##  3rd Qu.: 5.8282   3rd Qu.:20.00  
##  Max.   :17.4845   Max.   :62.11summary(time.Active.clean.the.hard.way)##    timecycle          timewalk    
##  Min.   : 0.2914   Min.   : 1.00  
##  1st Qu.: 2.9141   1st Qu.:10.00  
##  Median : 4.3711   Median :15.00  
##  Mean   : 4.5852   Mean   :16.10  
##  3rd Qu.: 5.8282   3rd Qu.:20.00  
##  Max.   :17.4845   Max.   :62.11summary(time.Active.clean)##    timecycle          timewalk    
##  Min.   : 0.2914   Min.   : 1.00  
##  1st Qu.: 2.9141   1st Qu.:10.00  
##  Median : 4.3711   Median :15.00  
##  Mean   : 4.5852   Mean   :16.10  
##  3rd Qu.: 5.8282   3rd Qu.:20.00  
##  Max.   :17.4845   Max.   :62.11ggplot(data = time.Active.clean) + 
  geom_area(aes(x = timecycle),
            stat = "bin", 
            binwidth = 5, 
            fill = "blue", 
            color = "blue", 
            alpha = 0.6) +
  geom_area(aes(x = timewalk), 
            stat = "bin", 
            binwidth = 5, 
            fill = "yellow", 
            color = "yellow", 
            alpha = 0.6)# Initialize a `ggplot` object that will use table `time.Active.clean` 
# as an input, and name it `p`
p <- ggplot(data = time.Active.clean)# By typing the name of a ggplot object, the default 
# behavior is to render it
pp + 
  # Add a geometric object of type area to the plot
  # Map the variable `timecycle` to the x-axis. Notice
  # that the y-axis is a calculated statistic, the count
  # of cases (returned by stat =bin), so we do not need
  # to specify it
  geom_area(aes(x = timecycle), 
            stat = "bin", 
            # The bindwidth controls the size of the bins
            # needed to count the number of cases at levels
            # of the variable mapped to the x-axis
            binwidth = 5)p + 
  geom_area(aes(x = timecycle), 
            stat = "bin", 
            binwidth = 5, 
            # fill controls the color of the polygon
            fill = "blue", 
            # color controls the color of the perimeter
            # of the polygon or of lines more generally
            color = "black", 
            alpha = 0.6)ggplot(data = time.Active.clean) + 
  geom_area(aes(x = timecycle),
            stat = "bin",
            binwidth = 5,
            fill = "blue", 
            color = "black", 
            alpha = 0.6) +
  # We can plot a second geometric element to the x-axis
  # using a different variable from the same table
  geom_area(aes(x = timewalk),
            stat = "bin", 
            binwidth = 5,
            fill = "yellow",
            color = "black",
            alpha = 0.6)ggplot(data = time.Active.clean) + 
  geom_area(aes(x = timecycle),
            stat = "bin",
            binwidth = 5, 
            fill = "blue", 
            color = "black", 
            alpha = 0.6) +
  geom_area(aes(x = timewalk), 
            stat = "bin", 
            binwidth = 5, 
            fill = "yellow", 
            color = "black",
            alpha = 0.6) +
  xlab("Time (in minutes)")# The pipe operator `%>%` takes an object and passes it on
# to the next function where it is used as the first argument
mc_mode_choice %>% 
  # `select()` retrieves columns from a data frame
  select(c("choice", "side_den")) %>%
  summary()##    choice       side_den    
##  Cycle: 48   Min.   : 0.00  
##  Walk :711   1st Qu.:18.19  
##  HSR  :336   Median :22.63  
##  Car  :281   Mean   :24.18  
##              3rd Qu.:35.70  
##              Max.   :59.41# Pipe the table to `ggplot()` where it is assumed to be the
# first argument of the function, i.e., data
mc_mode_choice %>%
  # Map `choice` to the x-axis and `side_den` to the y-axis
ggplot(aes(x = choice, 
           y = side_den)) + 
  # Add a geometric object of type boxplot
  geom_boxplot()library(mlogit)
data("Mode")