23  Expand

Author

Jarad Niemi

R Code Button

In this chapter, we will discuss the basics of wrangling an individual data set.

library("tidyverse")
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4.9000     ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.2          ✔ tibble    3.3.0     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

23.1 Mutate

The mutate() function allows you to create new variables in a data.frame or overwrite existing ones. Typically this involves changing units, performing calculations with other variables, and converting, reordering, or recoding factors.

# Mutate can create variables
# Here the new column `length` is a copy of the existing column `len`
ToothGrowth |> 
  mutate(length = len) |>
  head()
   len supp dose length
1  4.2   VC  0.5    4.2
2 11.5   VC  0.5   11.5
3  7.3   VC  0.5    7.3
4  5.8   VC  0.5    5.8
5  6.4   VC  0.5    6.4
6 10.0   VC  0.5   10.0
# The underlying data set has not changed:
# the mutated result above was printed but never assigned
head(ToothGrowth)
   len supp dose
1  4.2   VC  0.5
2 11.5   VC  0.5
3  7.3   VC  0.5
4  5.8   VC  0.5
5  6.4   VC  0.5
6 10.0   VC  0.5
# Save the updated data set
# Assign the result of the pipeline to `d` to keep the new column
d <- ToothGrowth |> 
  mutate(length = len) 

# New data set includes the new variable
head(d)
   len supp dose length
1  4.2   VC  0.5    4.2
2 11.5   VC  0.5   11.5
3  7.3   VC  0.5    7.3
4  5.8   VC  0.5    5.8
5  6.4   VC  0.5    6.4
6 10.0   VC  0.5   10.0

23.1.1 Numeric

# Mutate numeric: add a length column in cm next to the original `len`
ToothGrowth |>
  mutate(`Length (cm)` = len / 10) |> # `len` is a column of the data.frame
  summary() # `len` is in mm while `Length (cm)` is in cm
      len        supp         dose        Length (cm)   
 Min.   : 4.20   OJ:30   Min.   :0.500   Min.   :0.420  
 1st Qu.:13.07   VC:30   1st Qu.:0.500   1st Qu.:1.308  
 Median :19.25           Median :1.000   Median :1.925  
 Mean   :18.81           Mean   :1.167   Mean   :1.881  
 3rd Qu.:25.27           3rd Qu.:2.000   3rd Qu.:2.527  
 Max.   :33.90           Max.   :2.000   Max.   :3.390  
# Overwrite existing variable
ToothGrowth |>
  mutate(len = len / 10) |> # right-hand `len` is the existing column
  summary() # `len` is in cm
      len        supp         dose      
 Min.   :0.420   OJ:30   Min.   :0.500  
 1st Qu.:1.308   VC:30   1st Qu.:0.500  
 Median :1.925           Median :1.000  
 Mean   :1.881           Mean   :1.167  
 3rd Qu.:2.527           3rd Qu.:2.000  
 Max.   :3.390           Max.   :2.000  
# Use object outside data.frame
# `mm_per_cm` is an ordinary R object, not a column of ToothGrowth
mm_per_cm <- 10
ToothGrowth |>
  mutate(len = len / mm_per_cm) |>
  summary()
      len        supp         dose      
 Min.   :0.420   OJ:30   Min.   :0.500  
 1st Qu.:1.308   VC:30   1st Qu.:0.500  
 Median :1.925           Median :1.000  
 Mean   :1.881           Mean   :1.167  
 3rd Qu.:2.527           3rd Qu.:2.000  
 Max.   :3.390           Max.   :2.000  
# Use updated variable
# The second assignment sees the `len` produced by the first one,
# so the two steps together rescale `len` to the interval [0, 1]
ToothGrowth |>
  mutate(
    # Scale len between 0 and 1
    len = len - min(len), # shift so the minimum is 0
    len = len / max(len)  # reused `len` here: divide the shifted values by their maximum
  ) |>
  summary() # len now ranges between 0 and 1
      len         supp         dose      
 Min.   :0.0000   OJ:30   Min.   :0.500  
 1st Qu.:0.2988   VC:30   1st Qu.:0.500  
 Median :0.5067           Median :1.000  
 Mean   :0.4920           Mean   :1.167  
 3rd Qu.:0.7096           3rd Qu.:2.000  
 Max.   :1.0000           Max.   :2.000  

23.1.2 Categorical

We commonly need to convert between factor and character representations of categorical variables.

# Convert between character and factor
# `supp` is kept as-is; `supp_ch` and `supp_fa` are character and factor copies
d <- ToothGrowth |>
  select(supp) |>                 # only keep `supp` column
  mutate(
    supp_ch = as.character(supp), # convert to character
    supp_fa = as.factor(supp_ch)  # convert the character version back to factor
  ) 

d |> summary() # summary() only informative for factor
 supp      supp_ch          supp_fa
 OJ:30   Length:60          OJ:30  
 VC:30   Class :character   VC:30  
         Mode  :character          
# table() is always informative
# Counts per value for the character version
d$supp_ch |> table()

OJ VC 
30 30 
# Counts per level for the factor version
d$supp_fa |> table()

OJ VC 
30 30 

At this point, the main distinction between character variables and factor variables is that you can change the order of the levels of a factor variable, while the values of a character variable will always be in alphabetical order.

# Change order of factor variable
# Cannot change order of character variables
# Only the level order of `supp_fa` changes; `supp` and `supp_ch` are untouched
d |>
  mutate(
    supp_fa = factor(supp_fa, 
                     levels = c(
                       "VC",     # put VC first
                       "OJ"))    # then OJ
  ) |>
  summary()
 supp      supp_ch          supp_fa
 OJ:30   Length:60          VC:30  
 VC:30   Class :character   OJ:30  
         Mode  :character          
# Recode character or factor levels
# fct_recode() takes `new label` = "old value" pairs
# Note: after recoding, `supp_ch` is summarised like a factor (see output)
d |> 
  mutate(
    supp_ch = fct_recode(supp_ch, 
      `Ascorbic Acid` = "VC",
      `Orange Juice`  = "OJ"
    ),
    supp_fa = fct_recode(supp_fa,
      `Ascorbic Acid` = "VC",
      `Orange Juice`  = "OJ"
    )
  ) |>
  summary()
 supp             supp_ch            supp_fa  
 OJ:30   Orange Juice :30   Orange Juice :30  
 VC:30   Ascorbic Acid:30   Ascorbic Acid:30  

23.1.3 Both

Here we will show you how to use a single mutate() call to perform a number of calculations on both numeric and categorical variables.

# Diamonds
# Build `d` from diamonds: recompute depth, add price per carat,
# and reorder the levels of cut
d <- diamonds |>
  # Rather than precalculating depth 
  # we will calculate depth in a script
  select(-depth) |>
  mutate(
    # Calculate depth
    # (rows where x + y is 0 cannot be computed; see the NA's in the summary)
    depth = 2 * z / (x+y), # see ?diamonds for formula
    depth = 100 * depth,   # make depth a percent
    
    # Calculate $/weight
    price_per_carat = price / carat,
    
    # Reorder cut
    # Levels are listed in the order they should appear
    cut = factor(cut, 
                 levels = c(
                   "Ideal",
                   "Premium",
                   "Very Good",
                   "Good",
                   "Fair"
                 ))
  )

# View calculated variables
# Keep only the three columns created or modified above before summarising
d |>
  select(price_per_carat, depth, cut) |>
  summary()
 price_per_carat     depth               cut       
 Min.   : 1051   Min.   :  0.00   Ideal    :21551  
 1st Qu.: 2478   1st Qu.: 61.04   Premium  :13791  
 Median : 3495   Median : 61.84   Very Good:12082  
 Mean   : 4008   Mean   : 61.74   Good     : 4906  
 3rd Qu.: 4950   3rd Qu.: 62.53   Fair     : 1610  
 Max.   :17829   Max.   :619.28                    
                 NA's   :7                         
# View observations with NA depth
# is.na() is TRUE for NaN as well, so these rows are the NA's counted by summary()
# depth is NaN here because x, y, and z are all 0, i.e. 0 / 0
d |>
  filter(is.na(depth)) |> # see filter() below
  select(depth, x, y, z)  # NaN stands for `Not a number`
# A tibble: 7 × 4
  depth     x     y     z
  <dbl> <dbl> <dbl> <dbl>
1   NaN     0     0     0
2   NaN     0     0     0
3   NaN     0     0     0
4   NaN     0     0     0
5   NaN     0     0     0
6   NaN     0     0     0
7   NaN     0     0     0

23.2 Rename

We can use the rename() function to rename an existing variable in the data set, using the syntax new_name = old_name. If we want to use names that are not valid R object names, we need to enclose the name in backticks.

# Initial names of the columns in ToothGrowth
ToothGrowth |> 
  names()
[1] "len"  "supp" "dose"
# Renamed
# Each argument is new_name = old_name; backticks allow names with spaces/symbols
# Columns keep their original positions regardless of the argument order
ToothGrowth |>
  rename(
    `Dose (mg/day)` = dose,
    Supplement      = supp,
    `Length (mm)`   = len
  ) |>
  names()
[1] "Length (mm)"   "Supplement"    "Dose (mg/day)"
# We didn't save the renamed result, so the names in the ToothGrowth data set
# have not changed.
ToothGrowth |>
  names()
[1] "len"  "supp" "dose"
# Save the renamed data set
# Assign the result to `d` to keep the new names
d <- ToothGrowth |>
  rename(
    `Dose (mg/day)` = dose,
    Supplement      = supp,
    `Length (mm)`   = len
  ) 

# Saved data set has the new names
d |> 
  names()
[1] "Length (mm)"   "Supplement"    "Dose (mg/day)"