`dta_recode()` recodes variables in a data frame `dat` using a dictionary `dict`. The dictionary maps the original values of each variable to
their new values and labels.
Usage
dta_recode(
dat,
dict,
sheet = 1,
min_categories = 2,
max_categories = 25,
as_numeric = FALSE,
is_force_sequential = FALSE
)
Arguments
- dat
A data frame or tibble containing the variables to be recoded.
- dict
A data frame or tibble serving as the dictionary, specifying variable names, values, and labels.
- sheet
The name or index of the worksheet that contains the data for the dictionary.
- min_categories
Minimum number of categories for a variable to be recoded. Defaults to `2`.
- max_categories
Maximum number of categories for a variable to be recoded. Defaults to `25`.
- as_numeric
Logical. If `TRUE`, the recoded variables are returned as numeric. Defaults to `FALSE`.
- is_force_sequential
Logical indicating whether to force sequential values, that is, values that start at 1 and increase by 1. Defaults to `FALSE`.
Value
A tibble with recoded variables. If warnings are generated, they are saved to a CSV file and displayed.
Examples
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
data("data_sample")
glimpse(data_sample) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes…
#> $ python <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No", "N…
#> $ sas <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", …
#> $ stata <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ spss <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No"…
#> $ excel <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "No", …
data("dict_recode")
dta_gtable(dict_recode)
# By default, `dta_recode()` drops the labels if the
# values are not sequential or do not start at 1. To
# keep these labels, set `is_force_sequential` to
# `TRUE`. Note that this resets the given values to a
# sequence that starts at 1 and increases by 1.
result2 <- dta_recode(
dat = data_sample,
dict = dict_recode,
is_force_sequential = TRUE
)
glimpse(result2)
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <fct> Central, Central, South, West, North East, North East, …
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <ord> 50-59, 40-49, 40-49, 30-39, 40-49, 50-59, 50-59, 30-39,…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <fct> AB, B, AB, AB, A, A, AB, B, A, AB, AB, A, B, AB, A, B, …
#> $ marital_status <fct> Married, Married, Married, Single, Single, Married, Sin…
#> $ education <ord> Bachelors, Bachelors, Bachelors, Bachelors, Bachelors, …
#> $ employed <fct> Yes, No, No, Yes, Yes, No, Yes, No, Yes, Yes, Yes, No, …
#> $ ses <ord> Middle, Middle, High, Middle, Low, Middle, Low, Low, Mi…
#> $ language <fct> Mandarin, French, Arabic, English, Arabic, Mandarin, En…
#> $ phone <fct> OnePlus, OnePlus, Samsung, OnePlus, OnePlus, Samsung, O…
#> $ transport <fct> Bicycle, Train, Car, Bus, Bus, Bus, Bus, Train, Bicycle…
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…
# Return numeric codes
result3 <- dta_recode(
dat = data_sample,
dict = dict_recode,
as_numeric = TRUE
)
glimpse(result3) # look at the data type column and values
#> Rows: 2,500
#> Columns: 21
#> $ id <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region <dbl> 1, 1, 3, 4, 2, 2, 1, 2, 1, 3, 2, 2, 3, 2, 2, 4, 3, 3, 3…
#> $ age <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group <dbl> 4, 3, 3, 2, 3, 4, 4, 2, 4, 2, 3, 3, 1, 2, 3, 5, 5, 2, 1…
#> $ height <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group <dbl> 3, 2, 3, 3, 1, 1, 3, 2, 1, 3, 3, 1, 2, 3, 1, 2, 3, 3, 3…
#> $ marital_status <dbl> 2, 2, 2, 1, 1, 2, 1, 1, 3, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2…
#> $ education <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 3, 2, 1…
#> $ employed <dbl> 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2…
#> $ ses <dbl> 2, 2, 3, 2, 1, 2, 1, 1, 2, 1, 2, 1, 3, 3, 1, 2, 1, 1, 1…
#> $ language <dbl> 5, 2, 4, 1, 4, 5, 1, 5, 5, 3, 4, 1, 3, 4, 4, 3, 3, 4, 3…
#> $ phone <dbl> 5, 5, 2, 5, 5, 2, 5, 5, 6, 6, 4, 4, 2, 4, 4, 4, 4, 3, 7…
#> $ transport <dbl> 2, 5, 3, 4, 4, 4, 4, 5, 2, 1, 2, 3, 2, 1, 5, 3, 4, 3, 3…
#> $ gadgets_owned <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2…
#> $ python <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1…
#> $ sas <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
#> $ stata <dbl> 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1…
#> $ spss <dbl> 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1…
#> $ excel <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1…