Automatically recode categorical variables in a data frame
Source:R/dta_recode_auto.R
dta_recode_auto.Rd
dta_recode_auto()
automatically recodes categorical variables in
a data frame or tibble. That is, it assigns numerical codes starting
from 1 and increasing by 1 (i.e. 1, 2, 3, 4, ...) to the labels
(categories) according to their alphabetic order.
Usage
dta_recode_auto(
dat,
.columns = names(dat),
min_categories = 2,
max_categories = 25,
is_ordered = FALSE,
as_numeric = FALSE
)
Arguments
- dat
A data frame containing the variables to be recoded.
- .columns
A character vector specifying which columns to recode. A range of unquoted
column names (e.g. r:excel, as shown in the examples) is also accepted. Default is
names(dat)
, meaning all applicable columns will be recoded.
- min_categories
The minimum number of unique values a column must have in order to be recoded. Default is
2
.
- max_categories
The maximum number of unique values a column may have in order to be recoded. Default is
25
.
- is_ordered
Logical. If
TRUE
, creates an ordered factor. Default is
FALSE
.
- as_numeric
Logical. If
TRUE
, converts the result of recoding to numeric; otherwise, the recoded columns are kept as factors. Default is
FALSE
.
Details
The function automatically recodes categorical variables in the data frame
that have a number of unique values between min_categories
and
max_categories
. It assigns a numeric code to each unique category
based on their alphabetical order. For instance, if a column represents
the variable Gender
with categories Female
and Male
,
the function assigns 1
to Female
and 2
to Male
.
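This recoding is done using the factor
function, where the levels of
the factor are ordered alphabetically. When as_numeric
is TRUE
, the factor is then converted to numeric values.
This ensures that the recoding is consistent and follows a lexicographical
order for the categorical variables.
Note: in the as_numeric = TRUE
example below, the printed codes follow the order in which each category
first appears in the data (e.g. for region, Central = 1, South = 2, West = 3,
North East = 4; for employed, Yes = 1, No = 2) rather than alphabetical order.
Check the resulting codes against your data before relying on a particular ordering.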
Examples
library(dplyr)
data("data_sample")
glimpse(data_sample) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes…
#> $ python <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No", "N…
#> $ sas <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", …
#> $ stata <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ spss <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No"…
#> $ excel <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "No", …
# Auto-recode all categorical variables
result <- dta_recode_auto(dat = data_sample)
glimpse(result) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <fct> Central, Central, South, West, North East, North East, …
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <fct> 50-59, 40-49, 40-49, 30-39, 40-49, 50-59, 50-59, 30-39,…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <fct> AB, B, AB, AB, A, A, AB, B, A, AB, AB, A, B, AB, A, B, …
#> $ marital_status <fct> Married, Married, Married, Single, Single, Married, Sin…
#> $ education <fct> Bachelors, Bachelors, Bachelors, Bachelors, Bachelors, …
#> $ employed <fct> Yes, No, No, Yes, Yes, No, Yes, No, Yes, Yes, Yes, No, …
#> $ ses <fct> Middle, Middle, High, Middle, Low, Middle, Low, Low, Mi…
#> $ language <fct> Mandarin, French, Arabic, English, Arabic, Mandarin, En…
#> $ phone <fct> OnePlus, OnePlus, Samsung, OnePlus, OnePlus, Samsung, O…
#> $ transport <fct> Bicycle, Train, Car, Bus, Bus, Bus, Bus, Train, Bicycle…
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…
# Convert to numeric
result <- dta_recode_auto(
dat = data_sample, as_numeric = TRUE
)
glimpse(result) # look at the data type and values columns
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <dbl> 1, 1, 2, 3, 4, 4, 1, 4, 1, 2, 4, 4, 2, 4, 4, 3, 2, 2, 2…
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <dbl> 1, 2, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 4, 3, 2, 5, 5, 3, 4…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <dbl> 1, 2, 1, 1, 3, 3, 1, 2, 3, 1, 1, 3, 2, 1, 3, 2, 1, 1, 1…
#> $ marital_status <dbl> 1, 1, 1, 2, 2, 1, 2, 2, 3, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1…
#> $ education <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 3, 2, 1…
#> $ employed <dbl> 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1…
#> $ ses <dbl> 1, 1, 2, 1, 3, 1, 3, 3, 1, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3…
#> $ language <dbl> 1, 2, 3, 4, 3, 1, 4, 1, 1, 5, 3, 4, 5, 3, 3, 5, 5, 3, 5…
#> $ phone <dbl> 1, 1, 2, 1, 1, 2, 1, 1, 3, 3, 4, 4, 2, 4, 4, 4, 4, 5, 6…
#> $ transport <dbl> 1, 2, 3, 4, 4, 4, 4, 2, 1, 5, 1, 3, 1, 5, 2, 3, 4, 3, 3…
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2…
#> $ python <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1…
#> $ sas <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
#> $ stata <dbl> 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1…
#> $ spss <dbl> 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1…
#> $ excel <dbl> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2…
# Specify the columns using the `.columns` argument
result2 <- dta_recode_auto(
dat = data_sample, .columns = r:excel
)
glimpse(result2) # only the variables `r` through `excel` will be recoded
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…