Data Cleaning in R

Jan 20, 2024

Lecture Outline

Introduction
Tidyverse

dplyr - transforming data frames
lubridate - wrangling dates and times
ggplot2 - visualizing data
tidyr - cleaning data
readr - importing data
purrr - functional programming
stringr - wrangling strings

Final Exercises
Appendix

Introduction

Welcome to another workshop. Today's topic is data cleaning, or wrangling. This can be defined as the act of transforming data in such a way that is conducive to statistical analysis. For data to be clean, it should be without illegible elements, discrepencies, and corruption. Corrupted data may have duplicated or incomplete values, they may have formatting issues or other flaws that may cause errors later on. As shown in the plot above, data cleaning comprises most of the work data scientists engage in. For this reason, it is crucial for data scientists to become experts in cleaning data. The purpose of this workshop is to introduce you to some of the most popular and widely used packages for cleaning data.

Tidyverse

dplyr

Toy data set containing sales information for one week in a coffee shop. coffee_shop.tsv

select() - select columns from your dataset
filter(data, variable_name == "element") - filter out rows that meet your boolean criteria
arrange() - arrange your column data in ascending or descending order
join() - perform left, right, full, and inner joins in R
mutate( ) - adds new variable or rewrites an old variable, within a data frame
summarise( ) - aggregates variables using summary statistics (i.e mean, median)
group_by( ) - groups a data frame by the levels of a specified variable
|> - a pipeline operator that chains a sequence of calculations/tasks
%>% - alternative notation for pipeline operation

#install.packages("tidyverse")
library(tidyverse)
coffee_shop <- read.table("~/Desktop/coffee_shop.tsv", header = TRUE)
View(coffee_shop)

filtered_coffee <- filter(coffee_shop, beverage == "latte")    						# filter()
filtered_coffee
filtered_coffee2 <- filter(coffee_shop, quantity < 40)
filtered_coffee2

grouped_coffee <- group_by(coffee_shop, date)                  						# group_by()
grouped_coffee

only_latte <- select(coffee_shop, quantity)                           					# select()
only_latte
all_but_date <- select(coffee_shop, beverage:price)
all_but_date

summarised_coffee <- summarize(coffee_shop, weekly_sales = sum(quantity * price))                  	# summarize()
summarised_coffee

mutated_coffee <- mutate(coffee_shop, sales = quantity * price)    		 			# mutate()
mutated_coffee

arranged_coffee <- arrange(mutated_coffee, sales)    # arrange()
arranged_coffee
arranged_coffee_desc <- arrange(mutated_coffee, desc(sales))    
arranged_coffee_desc

only_mocha <- filter(mutated_coffee, beverage == "mocha")
only_mocha


######## Pipelines ########
pipe_ex <- 1:10
pipe_ex |> sum()
pipe_ex %>% sum()
pipe_ex |> mean()
pipe_ex |> order(decreasing = TRUE)

coffee_shop |> 
  mutate(sales = quantity * price) |>
  filter(beverage == "latte")

coffee_shop |> 
  mutate(sales = quantity * price) |>
  filter(beverage == "latte") |>
  summarise(weekly_latte_sales = sum(sales))

Checkpoint 1: General time to play around with the starwars data set

Challenge: Determine the BMI for each human. Hint: BMI = mass / (height/100)^2

lubridate

year/month/day() - extracts components of an inputted date
ymd() - converts string in YYYY-MM-DD format to date type
as.Date() - converts string to date type

class(coffee_shop$date)
coffee_shop$date <- as.Date(coffee_shop$date)						# converting
coffee_shop$date <- ymd(coffee_shop$date)						
class(coffee_shop$date)

year <- year(coffee_shop$date)								# extracting
year
month <- month(coffee_shop$date)
month
day <- day(coffee_shop$date)
day

my_day <- as.Date("2024-01-25")
my_day
next_month <- my_day + months(1) 							# arithmetic
next_month
next_year <- my_day + years(1)
next_year

start_date <- ymd("2024-01-01")
end_date <- ymd("2024-01-31")
days_between <- as.numeric(end_date - start_date)
days_between

ggplot2

ggplot(data, aes()) - creates powerful plotting interface
+ geom_point() - scatter plot
+ geom_bar() - bar plot
+ geom_smooth() - LOESS
+ geom_hist() - histogram
+ annotate() - manually adds text to plot
... and many other functions that can be found here

ggplot(coffee_shop, aes(x = date, y = quantity)) +    			# scatter plot
  geom_point()

ggplot(coffee_shop, aes(x = date, y = quantity)) +    			# line graph
  geom_line()

ggplot(coffee_shop, aes(x = date, y = quantity)) +    			# LOESS plot
  geom_smooth()


daily_sold <- coffee_shop |> 
  group_by(date) |>
  summarise(quantity = sum(quantity))
View(daily_sold)

ggplot(daily_sold, aes(x = date, y = quantity)) +    			# bar plot
  geom_bar(stat = "identity")

View(starwars)
ggplot(starwars, aes(height)) +     					# histogram
  geom_histogram()


############# Customizing ############# 
line_graph <- ggplot(coffee_shop, aes(x = date, y = quantity)) +    
  geom_line(col = "salmon", linewidth = 2) +
  labs(x = "Date", y = "Quantity", title = "Total # of Beverages Sold Per Day")
line_graph

line_graph +
  annotate(geom="text", x = ymd("2024-01-04"), y = 50, label = "DataFest", col = 6, cex = 5)
line_graph + theme_bw()
line_graph + theme_dark()


bar_plot <- ggplot(daily_sold, aes(x = date, y = quantity)) +
  geom_bar(stat = "identity", 
           fill = 1:7)
bar_plot
bar_plot + coord_flip()

Checkpoint 2: Create a bar plot that showws total number of lattes sold each day

Challenge: Create a line graph showing the solution to the following system of equations: \begin{cases} y = 2x^{2} + 3x - 5 \\ y = sin(x) + \frac{1}{2}x \\ \end{cases}

tidyr

gather() - converts df to long stretched out format, using column names as repeated elements
spread() - this takes two columns and “spreads” them into multiple columns
separate() - splits a single column into numerous columns - opposite of unite()
unite() - combines two or more columns into one - opposite of separate()
drop_na() - removes all rows containing an NA value
replace_na() - replaces all rows containing an NA value, must input replacements as a list

################ gather() ################
gathered_df <- coffee_shop |>
  gather(Cols, Elements, 
         beverage:price)
gathered_df


################ spread() ################ 
spread_df <- coffee_shop |>
                spread(beverage, quantity)
spread_df

spread_df |> drop_na()        									# drop_na()
spread_df |> replace_na(list(latte = 0, mocha = 0))         					# replace_na()


################ separate() ################ 
sep_df <- data.frame(Type = c("team.A", "team.B", "party.C", "crew.D", "party.E"))
sep_df

sep_df |> 
  separate(Type, c("Group Type", "Letter"))

sep_df |>
  separate(Type, c(NA, "Letter"))


################ unite() ################ 
names_df <- data.frame(First = c("Harry", "James", "Stephen", "Michael", "Taylor"),
                       Last = c("Styles", "Bond", "Hawking", "Buble", "Swift"),
                       Age = c(29, 37, 76, 48, 34))
names_df

united_df <- names_df |>
  unite("Full Name", First, Last, sep = " ")
united_df

names_df |>
  unite("Full Name", Last, First, sep = ", ")

## How would we reverse unite?
united_df |>
  separate("Full Name", c("First", "Last"), sep = " ")

readr

read_csv("path") - imports .csv file, input is the file path
read_table() - whitespace-separated files
col_types() - specifies desired column class types: character, factor, integer, etc

read_table("~/Desktop/coffee_shop.tsv")

read_table(
  "~/Desktop/Portfolio/dataCleaning/coffee_shop.tsv",
  col_types = cols(
      date      = col_date(format = ""),
      beverage  = col_factor(levels = c("latte", "mocha")),
      quantity  = col_integer(),
      price     = col_double()
  )
)

purrr

map() - apply function to a list or atomic vector
split() - divides vector into groups/tibbles
\(x) - shorthand notation for defining a function (not specific to purrr package)

coffee_shop
coffee_shop |>                  						# split()
  split(coffee_shop$beverage)

## Quick Function Review ##
double_func1 <- function(x) x * 2
double_func2 <- \(x) x * 2
double_func1(15)
double_func2(15)

map(5, double_func1)
map(5, \(x) x * 2)        							# map()
5 |> map(\(x) x^2)

## What if we wanted to predict the # of lattes and mochas sold Jan 8th?
coffee_shop |>
  split(coffee_shop$beverage) |>
  map(\(df) lm(quantity ~ date, data = df)) |>
  map(\(model) predict(model, newdata = data.frame(date = ymd("2024-01-08"))))

stringr

str_sub() - extract substrings
str_length() - return string length
str_split() - split string into list based on specified pattern
str_count() - return occurance frequency of specified pattern
str_detect() - return TRUE/FALSE depending on the existence of a specified pattern within string
str_locate() - returns start/end indeces of specified pattern within string
str_replace() - replaces pattern with specified alternative string
select(ends_with("")) - enables more specific column selection
str_to_lower()/str_to_upper() - converts the string into upper/lower case

a_string <- "Hello! I hope you're enjoying this workshop."
str_sub(a_string, start = 1, end = 5)       
str_length(a_string)       # length
str_split(a_string, pattern = " ")       
str_count(a_string, "o")       
str_count(a_string, "[oab]")
str_detect(a_string, "joy")
str_locate(a_string, "you")
str_replace(a_string, "you", "we")
str_to_upper(a_string)

s <- str_split(a_string, pattern = " ")[[1]]
str_c(s, collapse = " ")

names(starwars)
starwars |>
  select(ends_with("color"))

Final Exercise

Download the current market data for Ethereum from this website: Investing.com
Read in the file
Clean the data, specifically:

Convert Date column to date type
Transform the Vol. column to the correct numeric values - look out for 'K' and 'M' units
Transform the Change % to numeric type

Predict the USD/ETH price for next month
Plot the results using ggplot

Appendix

Exercise Solutions

Star Wars BMI

starwars |>
  filter(species == "Human") |>
  mutate(bmi = mass / (height / 100) ^ 2) |>
  select(name, bmi)

Lattes Sold Each Day

latte_by_day <- coffee_shop |>
  filter(beverage == "latte")

ggplot(latte_by_day, aes(x = date, y = quantity)) +
  geom_bar(stat = "identity", fill = 1:7, alpha = 0.7) +
  labs(x = "Day", y = "Lattes Sold", title = "Total # of Lattes Sold Per Day") + 
  theme_bw()

Plotting Solutions to a System of Equations

x = seq(-10, 10, 0.1)
y1 = 2*x^2 + 3*x - 5
y2 = sin(x) + x/2
sys_eq_df <- data.frame(x, y1, y2)
ysol <- y2[which(round(y1, 1) == round(y2, 1))]
xsol <- x[which(round(y1, 1) == round(y2, 1))]

ggplot(sys_eq_df, aes(x1)) +
  geom_line(aes(y = y1, col = "salmon"), linewidth = 1) +
  geom_line(aes(y = y2, col = "skyblue"), linewidth = 1) +
  theme_bw() + 
  ylim(-10, 10) +
  labs(x = "x", y = "y") +
  scale_color_discrete(name = "Lines", labels = c("y1", "y2")) +
  geom_point(x = xsol[1], y = ysol[1], col = "red", cex = 3) +
  geom_point(x = xsol[2], y = ysol[2], col = "red", cex = 3) +
  annotate(geom="text", x = 3.3, y = 0.3, label="(1.2, 1.53)") + 
  annotate(geom="text", x = -4.1, y = -3, label="(-2.2, -1.91)")

Final Exercise

### Step 1: Read in data
eth_df <- read_csv("~/Desktop/eth.csv",
                   col_types = cols(Date = col_date(format = "%m/%d/%Y"))
                   )

### Step 2: Clean data
k <- str_detect(eth_df$Vol., "K")
m <- str_detect(eth_df$Vol., "M")

eth_df$Vol.[k] <- str_sub(eth_df$Vol.[k], start = 1, end = 6) |>
                  as.numeric() |>
                  map(\(x) x * 1000)

eth_df$Vol.[m] <- str_sub(eth_df$Vol.[m], start = 1, end = 4) |>
                  as.numeric() |>
                  map(\(x) x * 1000000)

eth_df$`Change %` <- eth_df$`Change %` |>
                     str_replace("%", "") |>
                     as.numeric()

### Step 3: Fit model and predict price
lm <- lm(Price ~ Date, data = eth_df)
days <- seq(ymd('2023-12-27'), ymd('2024-02-27'), by='1 day')
pred <- predict(lm, newdata = data.frame(Date = days))

### Step 4: Plot
eth_df[33:63, ] <- NA
ggplot(eth_df, aes(Date, Price)) +
  geom_line(linewidth = 1.5) +
  geom_line(aes(x = days, y = pred), col = "salmon", linewidth = 1.5) +
  theme_bw() +
  ggtitle("Linear Regression USD/ETH") +
  annotate(geom = "text", x = ymd('2024-02-25'), y = pred[63] + 20, label = "$2483.68") +
  geom_point(x = ymd('2024-02-27'), y = pred[63], col = "red", cex = 3)

More Resources

You can learn more about packages specific to Tidyverse on their website here!
Dplyr
Lubridate
Ggplot2
Tidyr
Readr
Purrr
Stringr
Data.table
Janitor