Skip to contents

dta_recode() recodes variables in a data frame dat using a dictionary dict. The dictionary maps the original values of each variable to their new values and labels.

Usage

dta_recode(
  dat,
  dict,
  sheet = 1,
  min_categories = 2,
  max_categories = 25,
  as_numeric = FALSE,
  is_force_sequential = FALSE
)

Arguments

dat

A data frame or tibble containing the variables to be recoded.

dict

A data frame or tibble serving as the dictionary, specifying variable names, values, and labels.

sheet

The name or index of the worksheet that contains the data for the dictionary. Defaults to 1.

min_categories

Minimum number of categories for a variable to be recoded. Defaults to 2.

max_categories

Maximum number of categories for a variable to be recoded. Defaults to 25.

as_numeric

Logical. If TRUE, the recoded variables are returned as numeric. Defaults to FALSE.

is_force_sequential

Logical. If TRUE, the values are forced to be sequential, that is, they start at 1 and increase by 1. Defaults to FALSE.

Value

A tibble with recoded variables. If warnings are generated, they are saved to a CSV file and displayed.

Examples

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
data("data_sample")
glimpse(data_sample) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education      <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed       <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses            <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language       <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone          <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport      <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes…
#> $ python         <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No", "N…
#> $ sas            <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", …
#> $ stata          <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ spss           <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No"…
#> $ excel          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "No", …

data("dict_recode")
dta_gtable(dict_recode)
names values labels is_ordered
region 1 Central 0

2 North East

3 South

4 West
age_group 1 20-29 1

2 30-39

3 40-49

4 50-59

5 60-69

6 70+
blood_group 1 A 0

2 B

3 AB

4 O
marital_status 1 Single 0

2 Married

3 Other
education 1 Bachelors 1

2 Masters

3 Doctorate
employed 0 No 0

1 Yes
ses 1 Low 1

2 Middle

3 High
language 1 English 0

2 French

3 Spanish

4 Arabic

5 Mandarin

6 Other
phone 0 None 0

1 Samsung

2 Apple

3 Xiaomi

4 OnePlus

5 Google

6 Other
transport 1 Walking 0

2 Bicycle

3 Car

4 Bus

5 Train
r 0 No 0

1 Yes
python 0 No 0

1 Yes
sas 0 No 0

1 Yes
stata 0 No 0

1 Yes
spss 0 No 0

1 Yes
excel 0 No 0

1 Yes
# The default nature of `dta_recode()` is to drop the
# labels if the values are not sequential or do not
# start at 1. To maintain these labels, set
# `is_force_sequential` to `TRUE`. Note that this will
# reset the given values to sequential.
result2 <- dta_recode(
  dat = data_sample,
  dict = dict_recode,
  is_force_sequential = TRUE
)
glimpse(result2)
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <fct> Central, Central, South, West, North East, North East, …
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <ord> 50-59, 40-49, 40-49, 30-39, 40-49, 50-59, 50-59, 30-39,…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <fct> AB, B, AB, AB, A, A, AB, B, A, AB, AB, A, B, AB, A, B, …
#> $ marital_status <fct> Married, Married, Married, Single, Single, Married, Sin…
#> $ education      <ord> Bachelors, Bachelors, Bachelors, Bachelors, Bachelors, …
#> $ employed       <fct> Yes, No, No, Yes, Yes, No, Yes, No, Yes, Yes, Yes, No, …
#> $ ses            <ord> Middle, Middle, High, Middle, Low, Middle, Low, Low, Mi…
#> $ language       <fct> Mandarin, French, Arabic, English, Arabic, Mandarin, En…
#> $ phone          <fct> OnePlus, OnePlus, Samsung, OnePlus, OnePlus, Samsung, O…
#> $ transport      <fct> Bicycle, Train, Car, Bus, Bus, Bus, Bus, Train, Bicycle…
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python         <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas            <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata          <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss           <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel          <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…

# Return numeric codes
result3 <- dta_recode(
  dat = data_sample,
  dict = dict_recode,
  as_numeric = TRUE
)
glimpse(result3) # look at the data type column and values
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <dbl> 1, 1, 3, 4, 2, 2, 1, 2, 1, 3, 2, 2, 3, 2, 2, 4, 3, 3, 3…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <dbl> 4, 3, 3, 2, 3, 4, 4, 2, 4, 2, 3, 3, 1, 2, 3, 5, 5, 2, 1…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <dbl> 3, 2, 3, 3, 1, 1, 3, 2, 1, 3, 3, 1, 2, 3, 1, 2, 3, 3, 3…
#> $ marital_status <dbl> 2, 2, 2, 1, 1, 2, 1, 1, 3, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2…
#> $ education      <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 3, 2, 1…
#> $ employed       <dbl> 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2…
#> $ ses            <dbl> 2, 2, 3, 2, 1, 2, 1, 1, 2, 1, 2, 1, 3, 3, 1, 2, 1, 1, 1…
#> $ language       <dbl> 5, 2, 4, 1, 4, 5, 1, 5, 5, 3, 4, 1, 3, 4, 4, 3, 3, 4, 3…
#> $ phone          <dbl> 5, 5, 2, 5, 5, 2, 5, 5, 6, 6, 4, 4, 2, 4, 4, 4, 4, 3, 7…
#> $ transport      <dbl> 2, 5, 3, 4, 4, 4, 4, 5, 2, 1, 2, 3, 2, 1, 5, 3, 4, 3, 3…
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2…
#> $ python         <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1…
#> $ sas            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
#> $ stata          <dbl> 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1…
#> $ spss           <dbl> 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1…
#> $ excel          <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1…