Automatically recode categorical variables in a data frame

dta_recode_auto() automatically recodes categorical variables in a data frame or tibble. That is, it assigns numerical codes starting from 1 and increasing by 1 (i.e. 1, 2, 3, 4, ...) to the labels (categories) according to their alphabetic order.

Usage

dta_recode_auto(
  dat,
  .columns = names(dat),
  min_categories = 2,
  max_categories = 25,
  is_ordered = FALSE,
  as_numeric = FALSE
)

Arguments

dat: A data frame containing the variables to be recoded.
.columns: A character vector specifying which columns to recode. Default is names(dat), meaning all applicable columns will be recoded. As shown in the examples, a column range such as r:excel can also be supplied.
min_categories: The minimum number of unique values a column must have for recoding to happen. Default is 2.
max_categories: The maximum number of unique values a column may have for recoding to happen. Default is 25.
is_ordered: Logical. If TRUE, creates an ordered factor. Default is FALSE.
as_numeric: Logical. If TRUE, converts the recoded variables to numeric; otherwise, keeps them as factors. Default is FALSE.

Value

A data frame with recoded categorical variables.

Details

The function automatically recodes categorical variables in the data frame that have a number of unique values between min_categories and max_categories. It assigns a numeric code to each unique category based on their alphabetical order. For instance, if a column represents the variable Gender with categories Female and Male, the function assigns 1 to Female and 2 to Male. This recoding is done using the factor function, where the levels of the factor are ordered alphabetically. By default the recoded columns are returned as factors; when as_numeric = TRUE, they are additionally converted to their numeric codes. This ensures that the recoding is consistent and follows a lexicographical order for the categorical variables.

Examples

library(dplyr)
data("data_sample")
glimpse(data_sample) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education      <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed       <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses            <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language       <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone          <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport      <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes…
#> $ python         <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No", "N…
#> $ sas            <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", …
#> $ stata          <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ spss           <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No"…
#> $ excel          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "No", …

# Auto-recode all categorical variables

result <- dta_recode_auto(dat = data_sample)
glimpse(result) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <fct> Central, Central, South, West, North East, North East, …
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <fct> 50-59, 40-49, 40-49, 30-39, 40-49, 50-59, 50-59, 30-39,…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <fct> AB, B, AB, AB, A, A, AB, B, A, AB, AB, A, B, AB, A, B, …
#> $ marital_status <fct> Married, Married, Married, Single, Single, Married, Sin…
#> $ education      <fct> Bachelors, Bachelors, Bachelors, Bachelors, Bachelors, …
#> $ employed       <fct> Yes, No, No, Yes, Yes, No, Yes, No, Yes, Yes, Yes, No, …
#> $ ses            <fct> Middle, Middle, High, Middle, Low, Middle, Low, Low, Mi…
#> $ language       <fct> Mandarin, French, Arabic, English, Arabic, Mandarin, En…
#> $ phone          <fct> OnePlus, OnePlus, Samsung, OnePlus, OnePlus, Samsung, O…
#> $ transport      <fct> Bicycle, Train, Car, Bus, Bus, Bus, Bus, Train, Bicycle…
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python         <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas            <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata          <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss           <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel          <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…

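# Convert to numeric
# NOTE(review): in the output below the codes follow the order of first
# appearance (e.g. region: "Central" = 1, "South" = 2, "West" = 3,
# "North East" = 4; employed: "Yes" = 1, "No" = 2), not the alphabetical
# order described in this page -- confirm which behaviour is intended.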

result <- dta_recode_auto(
  dat = data_sample, as_numeric = TRUE
)
glimpse(result) # look at the data type and values columns
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <dbl> 1, 1, 2, 3, 4, 4, 1, 4, 1, 2, 4, 4, 2, 4, 4, 3, 2, 2, 2…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <dbl> 1, 2, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 4, 3, 2, 5, 5, 3, 4…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <dbl> 1, 2, 1, 1, 3, 3, 1, 2, 3, 1, 1, 3, 2, 1, 3, 2, 1, 1, 1…
#> $ marital_status <dbl> 1, 1, 1, 2, 2, 1, 2, 2, 3, 1, 1, 2, 1, 2, 2, 2, 1, 1, 1…
#> $ education      <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 3, 2, 1…
#> $ employed       <dbl> 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1…
#> $ ses            <dbl> 1, 1, 2, 1, 3, 1, 3, 3, 1, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3…
#> $ language       <dbl> 1, 2, 3, 4, 3, 1, 4, 1, 1, 5, 3, 4, 5, 3, 3, 5, 5, 3, 5…
#> $ phone          <dbl> 1, 1, 2, 1, 1, 2, 1, 1, 3, 3, 4, 4, 2, 4, 4, 4, 4, 5, 6…
#> $ transport      <dbl> 1, 2, 3, 4, 4, 4, 4, 2, 1, 5, 1, 3, 1, 5, 2, 3, 4, 3, 3…
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2…
#> $ python         <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1…
#> $ sas            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
#> $ stata          <dbl> 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1…
#> $ spss           <dbl> 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1…
#> $ excel          <dbl> 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2…

# Specify the columns using the `.columns` argument

result2 <- dta_recode_auto(
  dat = data_sample, .columns = r:excel
)
glimpse(result2) # only the variables `r` through `excel` will be recoded
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education      <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed       <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses            <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language       <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone          <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport      <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python         <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas            <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata          <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss           <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel          <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…