22 Reduce

Author

Jarad Niemi

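In this chapter, we will discuss the basics of reducing the size of your data set by selecting a subset of the columns or filtering a subset of the rows. In addition, we can use the slice functions to take rows from the top or bottom of the data set, the rows with the smallest values of a variable, or a random subset of the rows.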

library("tidyverse")

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4.9000     ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.2          ✔ tibble    3.3.0     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

22.1 Select

The select() function allows you to keep or remove certain variables.

# Remind ourselves of the variables in ToothGrowth
head(ToothGrowth)

   len supp dose
1  4.2   VC  0.5
2 11.5   VC  0.5
3  7.3   VC  0.5
4  5.8   VC  0.5
5  6.4   VC  0.5
6 10.0   VC  0.5

# select does not change the underlying data set
ToothGrowth |>
  select(len, supp) |>
  head()

   len supp
1  4.2   VC
2 11.5   VC
3  7.3   VC
4  5.8   VC
5  6.4   VC
6 10.0   VC

# ToothGrowth has not changed
head(ToothGrowth)

   len supp dose
1  4.2   VC  0.5
2 11.5   VC  0.5
3  7.3   VC  0.5
4  5.8   VC  0.5
5  6.4   VC  0.5
6 10.0   VC  0.5

# Save the result
d <- ToothGrowth |>
  select(len, supp)

# d only has len and supp
head(d)

   len supp
1  4.2   VC
2 11.5   VC
3  7.3   VC
4  5.8   VC
5  6.4   VC
6 10.0   VC

There are a variety of ways to tell select() which variables to keep or remove.

# Keep variables by name
ToothGrowth |>
  select(len, supp) |>
  names()

[1] "len"  "supp"

# Remove variables by name
ToothGrowth |>
  select(-len, -supp) |> 
  names()

[1] "dose"

# Keep variables in a range (price comes after cut in diamonds, so price:cut returns the columns in reverse order)
diamonds |>
  select(price:cut) |>
  names()

[1] "price"   "table"   "depth"   "clarity" "color"   "cut"

There are a variety of helper functions to use with the select() function. Take a look at ?select for more details.

# Select helper functions
diamonds |>
  select(starts_with("c")) |>
  names()

[1] "carat"   "cut"     "color"   "clarity"

diamonds |>
  select(x:last_col()) |>
  names()

[1] "x" "y" "z"

diamonds |>
  select(contains("y")) |>
  names()

[1] "clarity" "y"

You can also use select() to reorder columns.

# Reorder columns
ToothGrowth |> 
  select(len, dose, supp) |>
  head()

   len dose supp
1  4.2  0.5   VC
2 11.5  0.5   VC
3  7.3  0.5   VC
4  5.8  0.5   VC
5  6.4  0.5   VC
6 10.0  0.5   VC

names(diamonds)

 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
 [8] "x"       "y"       "z"

diamonds |>
  select(carat, price,    # carat first then price
         everything()) |> # all other columns (in order)
  names()

 [1] "carat"   "price"   "cut"     "color"   "clarity" "depth"   "table"  
 [8] "x"       "y"       "z"

22.1.1 Pull

Data pipelines work best when each function returns a data.frame, as the other functions in this chapter do. If you want to investigate a single variable, you can use the pull() function, which extracts that column as a vector. This is equivalent to accessing the column with $, but it can be included in a dplyr pipeline.

# Pull a variable
ToothGrowth |>
  pull(len) |>
  summary()

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   4.20   13.07   19.25   18.81   25.27   33.90

diamonds |>
  pull(cut) |>
  summary()

     Fair      Good Very Good   Premium     Ideal 
     1610      4906     12082     13791     21551

22.2 Filter

The filter() function allows you to keep only the observations (rows) that satisfy some criteria.

# Filter by numeric variable
ToothGrowth |> filter(len <  6)

  len supp dose
1 4.2   VC  0.5
2 5.8   VC  0.5
3 5.2   VC  0.5

ToothGrowth |> filter(len >= 30)

   len supp dose
1 33.9   VC    2
2 32.5   VC    2
3 30.9   OJ    2

ToothGrowth |> 
  filter(dose == 2) |>
  summary()

      len        supp         dose  
 Min.   :18.50   OJ:10   Min.   :2  
 1st Qu.:23.52   VC:10   1st Qu.:2  
 Median :25.95           Median :2  
 Mean   :26.10           Mean   :2  
 3rd Qu.:27.82           3rd Qu.:2  
 Max.   :33.90           Max.   :2

ToothGrowth |> 
  filter(dose != 2) |> 
  summary()

      len         supp         dose     
 Min.   : 4.200   OJ:20   Min.   :0.50  
 1st Qu.: 9.925   VC:20   1st Qu.:0.50  
 Median :15.200           Median :0.75  
 Mean   :15.170           Mean   :0.75  
 3rd Qu.:19.775           3rd Qu.:1.00  
 Max.   :27.300           Max.   :1.00

You can also filter by character (or factor) variables.

# Filter by character (or factor) variable
ToothGrowth |>
  filter(supp == "OJ") |>
  summary()

      len        supp         dose      
 Min.   : 8.20   OJ:30   Min.   :0.500  
 1st Qu.:15.53   VC: 0   1st Qu.:0.500  
 Median :22.70           Median :1.000  
 Mean   :20.66           Mean   :1.167  
 3rd Qu.:25.73           3rd Qu.:2.000  
 Max.   :30.90           Max.   :2.000

ToothGrowth |>
  filter(supp != "VC") |>
  summary()

      len        supp         dose      
 Min.   : 8.20   OJ:30   Min.   :0.500  
 1st Qu.:15.53   VC: 0   1st Qu.:0.500  
 Median :22.70           Median :1.000  
 Mean   :20.66           Mean   :1.167  
 3rd Qu.:25.73           3rd Qu.:2.000  
 Max.   :30.90           Max.   :2.000

diamonds |>
  filter(cut %in% c("Premium", "Ideal")) |>
  pull(cut) |>
  summary()

     Fair      Good Very Good   Premium     Ideal 
        0         0         0     13791     21551

You can also filter using multiple variables.

# Filter on multiple variables
ToothGrowth |>
  filter(supp == "OJ", dose == 0.5) |>
  summary()

      len        supp         dose    
 Min.   : 8.20   OJ:10   Min.   :0.5  
 1st Qu.: 9.70   VC: 0   1st Qu.:0.5  
 Median :12.25           Median :0.5  
 Mean   :13.23           Mean   :0.5  
 3rd Qu.:16.18           3rd Qu.:0.5  
 Max.   :21.50           Max.   :0.5

diamonds |>
  filter(
    cut %in% c("Premium", "Ideal"),
    carat <= .75,
    color == "D",
    !(clarity %in% c("VS1", "VS2")) # not VS1 or VS2
  ) |>
  select(cut, carat, color, clarity) |>
  summary()

        cut           carat        color       clarity   
 Fair     :   0   Min.   :0.2300   D:1798   SI1    :920  
 Good     :   0   1st Qu.:0.3300   E:   0   SI2    :382  
 Very Good:   0   Median :0.4100   F:   0   VVS2   :298  
 Premium  : 634   Mean   :0.4516   G:   0   VVS1   :166  
 Ideal    :1164   3rd Qu.:0.5400   H:   0   IF     : 26  
                  Max.   :0.7500   I:   0   I1     :  6  
                                   J:   0   (Other):  0

22.3 Slice

The slice functions allow you to subset the data in a variety of ways.

# Top of data.frame
ToothGrowth |> 
  slice_head()

  len supp dose
1 4.2   VC  0.5

# Bottom of data.frame
ToothGrowth |> 
  slice_tail()

  len supp dose
1  23   OJ    2

# Random rows (the rows returned change on each run unless you call set.seed() first)
ToothGrowth |>
  slice_sample(n = 5) # number of rows

   len supp dose
1 17.3   VC  1.0
2  5.2   VC  0.5
3  7.0   VC  0.5
4 23.3   VC  2.0
5 26.4   OJ  1.0

ToothGrowth |>
  slice_sample(prop = 2/60) # proportion of rows

   len supp dose
1 16.5   VC  1.0
2 16.5   OJ  0.5

# Rows with the smallest values of a variable
ToothGrowth |>
  slice_min(
    len, # variable to order data by
    prop = 2 / 60
  )

  len supp dose
1 4.2   VC  0.5
2 5.2   VC  0.5