Recode variables in a data frame based on a dictionary

dta_recode() recodes variables in a data frame (dat) using a dictionary (dict). The dictionary maps the original values of each variable to their new values and labels.

Usage

dta_recode(
  dat,
  dict,
  sheet = 1,
  min_categories = 2,
  max_categories = 25,
  as_numeric = FALSE,
  is_force_sequential = FALSE
)

Arguments

dat: A data frame or tibble containing the variables to be recoded.
dict: A data frame or tibble serving as the dictionary, specifying variable names, values, and labels.
sheet: The name or index of the worksheet that contains the data for the dictionary.
min_categories: Minimum number of categories for a variable to be recoded. Defaults to 2.
max_categories: Maximum number of categories for a variable to be recoded. Defaults to 25.
as_numeric: Logical. If TRUE, the recoded variables are returned as numeric. Defaults to FALSE.
is_force_sequential: Logical. If TRUE, the values are forced to be sequential, that is, to start at 1 and increase by 1. Defaults to FALSE.

Value

A tibble with recoded variables. If warnings are generated, they are saved to a CSV file and displayed.

Examples

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
data("data_sample")
glimpse(data_sample) # look at the data type column
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <chr> "Central", "Central", "South", "West", "North East", "N…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <chr> "50-59", "40-49", "40-49", "30-39", "40-49", "50-59", "…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <chr> "AB", "B", "AB", "AB", "A", "A", "AB", "B", "A", "AB", …
#> $ marital_status <chr> "Married", "Married", "Married", "Single", "Single", "M…
#> $ education      <chr> "Bachelors", "Bachelors", "Bachelors", "Bachelors", "Ba…
#> $ employed       <chr> "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ ses            <chr> "Middle", "Middle", "High", "Middle", "Low", "Middle", …
#> $ language       <chr> "Mandarin", "French", "Arabic", "English", "Arabic", "M…
#> $ phone          <chr> "OnePlus", "OnePlus", "Samsung", "OnePlus", "OnePlus", …
#> $ transport      <chr> "Bicycle", "Train", "Car", "Bus", "Bus", "Bus", "Bus", …
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes…
#> $ python         <chr> "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No", "N…
#> $ sas            <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", …
#> $ stata          <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Ye…
#> $ spss           <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No"…
#> $ excel          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "No", …

data("dict_recode")
dta_gtable(dict_recode)


  names
      values
      labels
      is_ordered
    
region
1
Central
0


2
North East



3
South



4
West

age_group
1
20-29
1


2
30-39



3
40-49



4
50-59



5
60-69



6
70+

blood_group
1
A
0


2
B



3
AB



4
O

marital_status
1
Single
0


2
Married



3
Other

education
1
Bachelors
1


2
Masters



3
Doctorate

employed
0
No
0


1
Yes

ses
1
Low
1


2
Middle



3
High

language
1
English
0


2
French



3
Spanish



4
Arabic



5
Mandarin



6
Other

phone
0
None
0


1
Samsung



2
Apple



3
Xiaomi



4
OnePlus



5
Google



6
Other

transport
1
Walking
0


2
Bicycle



3
Car



4
Bus



5
Train

r
0
No
0


1
Yes

python
0
No
0


1
Yes

sas
0
No
0


1
Yes

stata
0
No
0


1
Yes

spss
0
No
0


1
Yes

excel
0
No
0


1
Yes


# By default, `dta_recode()` drops the labels when the
# values are not sequential or do not start at 1. To
# keep these labels, set `is_force_sequential` to `TRUE`.
# Note that this resets the given values to a sequence
# starting at 1.

result2 <- dta_recode(
  dat = data_sample,
  dict = dict_recode,
  is_force_sequential = TRUE
)
glimpse(result2)
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <fct> Central, Central, South, West, North East, North East, …
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <ord> 50-59, 40-49, 40-49, 30-39, 40-49, 50-59, 50-59, 30-39,…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <fct> AB, B, AB, AB, A, A, AB, B, A, AB, AB, A, B, AB, A, B, …
#> $ marital_status <fct> Married, Married, Married, Single, Single, Married, Sin…
#> $ education      <ord> Bachelors, Bachelors, Bachelors, Bachelors, Bachelors, …
#> $ employed       <fct> Yes, No, No, Yes, Yes, No, Yes, No, Yes, Yes, Yes, No, …
#> $ ses            <ord> Middle, Middle, High, Middle, Low, Middle, Low, Low, Mi…
#> $ language       <fct> Mandarin, French, Arabic, English, Arabic, Mandarin, En…
#> $ phone          <fct> OnePlus, OnePlus, Samsung, OnePlus, OnePlus, Samsung, O…
#> $ transport      <fct> Bicycle, Train, Car, Bus, Bus, Bus, Bus, Train, Bicycle…
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <fct> No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, No, N…
#> $ python         <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Yes, …
#> $ sas            <fct> No, No, No, No, No, No, No, No, Yes, No, No, No, No, No…
#> $ stata          <fct> No, No, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, …
#> $ spss           <fct> No, No, Yes, No, Yes, No, No, Yes, No, Yes, No, Yes, No…
#> $ excel          <fct> Yes, No, No, No, No, No, No, No, No, No, No, No, Yes, N…

# Return numeric codes

result3 <- dta_recode(
  dat = data_sample,
  dict = dict_recode,
  as_numeric = TRUE
)
glimpse(result3) # look at the data type column and values
#> Rows: 2,500
#> Columns: 21
#> $ id             <chr> "STM/7539", "STM/7993", "STM/7387", "STM/5598", "STM/59…
#> $ region         <dbl> 1, 1, 3, 4, 2, 2, 1, 2, 1, 3, 2, 2, 3, 2, 2, 4, 3, 3, 3…
#> $ age            <dbl> 56, 46, 45, 37, 45, 51, 56, 37, 50, 38, 48, 41, 24, 34,…
#> $ age_group      <dbl> 4, 3, 3, 2, 3, 4, 4, 2, 4, 2, 3, 3, 1, 2, 3, 5, 5, 2, 1…
#> $ height         <dbl> 1.70, 1.57, 1.47, 1.67, 1.69, 1.90, 1.85, 1.64, 1.61, 1…
#> $ weight         <dbl> 73, 53, 85, 77, 53, 75, 69, 53, 56, 89, 73, 86, 76, 81,…
#> $ blood_group    <dbl> 3, 2, 3, 3, 1, 1, 3, 2, 1, 3, 3, 1, 2, 3, 1, 2, 3, 3, 3…
#> $ marital_status <dbl> 2, 2, 2, 1, 1, 2, 1, 1, 3, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2…
#> $ education      <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 3, 2, 1…
#> $ employed       <dbl> 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2…
#> $ ses            <dbl> 2, 2, 3, 2, 1, 2, 1, 1, 2, 1, 2, 1, 3, 3, 1, 2, 1, 1, 1…
#> $ language       <dbl> 5, 2, 4, 1, 4, 5, 1, 5, 5, 3, 4, 1, 3, 4, 4, 3, 3, 4, 3…
#> $ phone          <dbl> 5, 5, 2, 5, 5, 2, 5, 5, 6, 6, 4, 4, 2, 4, 4, 4, 4, 3, 7…
#> $ transport      <dbl> 2, 5, 3, 4, 4, 4, 4, 5, 2, 1, 2, 3, 2, 1, 5, 3, 4, 3, 3…
#> $ gadgets_owned  <chr> "Smart TV, Tablet, Desktop Computer, Digital Camera, Sm…
#> $ r              <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2…
#> $ python         <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1…
#> $ sas            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1…
#> $ stata          <dbl> 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1…
#> $ spss           <dbl> 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1…
#> $ excel          <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1…

names	values	labels	is_ordered
region	1	Central	0
	2	North East
	3	South
	4	West
age_group	1	20-29	1
	2	30-39
	3	40-49
	4	50-59
	5	60-69
	6	70+
blood_group	1	A	0
	2	B
	3	AB
	4	O
marital_status	1	Single	0
	2	Married
	3	Other
education	1	Bachelors	1
	2	Masters
	3	Doctorate
employed	0	No	0
	1	Yes
ses	1	Low	1
	2	Middle
	3	High
language	1	English	0
	2	French
	3	Spanish
	4	Arabic
	5	Mandarin
	6	Other
phone	0	None	0
	1	Samsung
	2	Apple
	3	Xiaomi
	4	OnePlus
	5	Google
	6	Other
transport	1	Walking	0
	2	Bicycle
	3	Car
	4	Bus
	5	Train
r	0	No	0
	1	Yes
python	0	No	0
	1	Yes
sas	0	No	0
	1	Yes
stata	0	No	0
	1	Yes
spss	0	No	0
	1	Yes
excel	0	No	0
	1	Yes

Recode variables in a data frame based on a dictionary

Usage

Arguments

Value

See also

Examples