Data Visualization

Ziyuan Huang

Last Updated: 2025-12-11

Programming Suggestion

# Formatting example: one layer per line, each line ending in `+`.
# `textline` and `cleanup` are created further down in this deck.
textline +
  stat_summary(fun = mean, geom = "point") +
  stat_summary(fun = mean, geom = "line", aes(group = Group)) +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width = 0.2) +
  labs(x = "Measurement Time", y = "Mean Grammar Score") +
  cleanup +
  scale_colour_manual(
    name = "Texting Option",
    labels = c("All the texts", "None of the texts"),
    values = c("Black", "Grey")
  ) +
  scale_x_discrete(labels = c("Baseline", "Six Months"))

Outline

Working with Files

library(rio)
# Read the SPSS file with rio and inspect the structure of the result.
chickflick <- import(file = "data/ChickFlick.sav")
str(object = chickflick)
## 'data.frame':    40 obs. of  3 variables:
##  $ gender : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "Gender of Participant"
##   ..- attr(*, "format.spss")= chr "F8.0"
##   ..- attr(*, "labels")= Named num [1:2] 1 2
##   .. ..- attr(*, "names")= chr [1:2] "Male" "Female"
##  $ film   : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "Name of Film"
##   ..- attr(*, "format.spss")= chr "F8.0"
##   ..- attr(*, "display_width")= int 18
##   ..- attr(*, "labels")= Named num [1:2] 1 2
##   .. ..- attr(*, "names")= chr [1:2] "Bridget Jones's Diary" "Memento"
##  $ arousal: num  22 13 16 10 18 24 13 14 19 23 ...
##   ..- attr(*, "label")= chr "Psychological Arousal During the Film"
##   ..- attr(*, "format.spss")= chr "F8.0"

Factor Categorical Variables

# Frequency counts of the raw numeric codes, before they are factored.
table(chickflick[["gender"]])
## 
##  1  2 
## 20 20
table(chickflick[["film"]])
## 
##  1  2 
## 20 20

How to Factor

# Replace the numeric gender codes with a labelled factor.
chickflick$gender <- factor(
  chickflick$gender,            #the variable you want to factor
  levels = c(1, 2),             #the information already in the data
  labels = c("Male", "Female")  #the labels for those levels, in the same order
)

# The counts are now reported by label instead of by code.
table(chickflick$gender)
## 
##   Male Female 
##     20     20

Data Structure Format

Rearrange Data: Wide to Long

library(reshape) #note: modern alternative is tidyr::pivot_longer()
# Read the wide-format data (pre and post scores in separate columns)
# and preview the first rows.
cricket <- import(file = "data/Jiminy Cricket.csv")
head(x = cricket)
##   ID Strategy Success_Pre Success_Post
## 1  1        1          53           74
## 2  2        1          62           67
## 3  3        1          52           33
## 4  4        1          57           62
## 5  5        1          55           44
## 6  6        1          52           65

Rearrange Data: Wide to Long

# Reshape wide to long: the id columns are repeated, and every measured column
# becomes rows of (variable, value).
# reshape::melt() names its arguments `id.vars` and `measure.vars`; a name
# such as `measured` matches neither and is silently ignored.
longcricket <- melt(cricket,  #name of dataset
                    id.vars = c("ID", "Strategy"),
                    measure.vars = c("Success_Pre", "Success_Post"))
#you can leave measure.vars out: every non-id column is then melted
head(longcricket)
##   ID Strategy    variable value
## 1  1        1 Success_Pre    53
## 2  2        1 Success_Pre    62
## 3  3        1 Success_Pre    52
## 4  4        1 Success_Pre    57
## 5  5        1 Success_Pre    55
## 6  6        1 Success_Pre    52

Rearrange Data: Wide to Long

# Columns 3 and 4 carry melt()'s default names; look at them, then rename.
names(longcricket)[3:4] #just to figure out which ones
## [1] "variable" "value"
names(longcricket)[3:4] <- c("Time", "Score")

The Art of Presenting Data

Why is this Graph Bad?

Other Bad Design Choices

Why is this Graph Better?

Do Not Deceive the Reader!

Plotting in R

library(ggplot2)
library(Hmisc)

Working with ggplot2

#an example 
#an example: the base plot object stores the data and the aesthetic mappings;
#dataset, x_axis, y_axis and legend_var are placeholders for your own names
myGraph <- ggplot(
  data = dataset,
  mapping = aes(
    x = x_axis,
    y = y_axis,
    colour = legend_var,
    fill = legend_var
  )
)

Working with ggplot2

#an example part 2
#an example part 2: geoms and labels are added to the base plot with `+`
#geom_col() draws bars at the mapped y values; geom_bar() counts rows instead
#and raises an error when a y aesthetic is mapped, as it is in myGraph
myGraph + 
  geom_col() +
  geom_point() +
  xlab("X Axis Label") + 
  ylab("Y Axis Label")

Histograms

Histogram: Example

# Base plot for the histogram: dataset first, then only the X axis is mapped.
# Printing it on its own shows the plot before any geom layer is added.
crickethist <- ggplot(cricket, aes(x = Success_Pre))
crickethist

Histogram: Example

# Add the histogram layer with ggplot2's default binning.
crickethist + geom_histogram()

Histogram: Example

# Each bin now spans one unit of Success_Pre.
crickethist + geom_histogram(binwidth = 1)

Histogram: Example

# colour sets the bar outline, fill sets the bar interior.
crickethist +
  geom_histogram(
    binwidth = 1,
    colour = "purple",
    fill = "magenta"
  )

Histogram: Example

# Same histogram, now with readable axis titles.
crickethist +
  geom_histogram(
    binwidth = 1,
    colour = "purple",
    fill = "magenta"
  ) +
  labs(x = "Success Pre Test", y = "Frequency")

Histogram: Example 2

# Read the festival data and inspect the column types.
festival <- import(file = "data/festival.csv")
str(object = festival)
## 'data.frame':    810 obs. of  5 variables:
##  $ ticknumb: int  2111 2229 2338 2384 2401 2405 2467 2478 2490 2504 ...
##  $ gender  : chr  "Male" "Female" "Male" "Female" ...
##  $ day1    : num  2.64 0.97 0.84 3.03 0.88 0.85 1.56 3.02 2.29 1.11 ...
##  $ day2    : num  1.35 1.41 NA NA 0.08 NA NA NA NA 0.44 ...
##  $ day3    : num  1.61 0.29 NA NA NA NA NA NA NA 0.55 ...

Histogram: Example 2

# Histogram of the day1 scores using a built-in theme.
festivalhist <- ggplot(festival, aes(x = day1))
festivalhist +
  geom_histogram(binwidth = 1, colour = "blue") +
  labs(x = "Day 1 of Festival Hygiene", y = "Frequency") +
  theme_bw() #theme_classic() also good!

Focus on these Facets

Clean Up?

# Reusable theme object: append `+ cleanup` to a plot to apply these settings.
cleanup <- theme(
  panel.grid.major = element_blank(),             #no grid lines
  panel.grid.minor = element_blank(),             #no grid lines
  panel.background = element_blank(),             #no background
  axis.line.x = element_line(colour = "black"),   #black x axis line
  axis.line.y = element_line(colour = "black"),   #black y axis line
  legend.key = element_rect(fill = "white"),      #no legend background
  text = element_text(size = 15)                  #bigger text size
)

Clean Up?

# The same histogram, styled with the custom `cleanup` theme instead of theme_bw().
festivalhist +
  geom_histogram(binwidth = 1, colour = "blue") +
  labs(x = "Day 1 of Festival Hygiene", y = "Frequency") +
  cleanup

Scatterplots

Scatterplots: Example

# Read the exam anxiety data and inspect the column types.
exam <- import(file = "data/Exam Anxiety.csv")
str(object = exam)
## 'data.frame':    103 obs. of  5 variables:
##  $ Code   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Revise : int  4 11 27 53 4 22 16 21 25 18 ...
##  $ Exam   : int  40 65 80 80 40 70 20 55 50 40 ...
##  $ Anxiety: num  86.3 88.7 70.2 61.3 89.5 ...
##  $ Gender : int  1 2 1 1 1 2 2 2 2 2 ...

Scatterplots: Example

# Gender is stored as integer codes: check the counts, attach labels,
# then confirm the counts per group are unchanged.
table(exam[["Gender"]])
## 
##  1  2 
## 52 51
exam$Gender <- factor(
  exam$Gender,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
table(exam[["Gender"]])
## 
##   Male Female 
##     52     51

Simple Scatterplot

# Base scatterplot: Anxiety on the x axis, Exam on the y axis.
scatter <- ggplot(data = exam, mapping = aes(x = Anxiety, y = Exam))
scatter +
  geom_point() +
  labs(x = "Anxiety Score", y = "Exam Score") +
  cleanup

Simple Scatterplot with Regression Line

# Add a linear regression line (method = "lm"): colour sets the line,
# fill sets the shaded band around it.
scatter +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    colour = "black",
    fill = "blue"
  ) +
  labs(x = "Anxiety Score", y = "Exam Score") +
  cleanup

Grouped Scatterplot

Grouped Scatterplot with Regression Line

# Grouped scatterplot: Gender is mapped to both colour and fill, and each
# aesthetic gets its own manual scale with the same legend title and labels.
scatter2 <- ggplot(
  exam,
  aes(x = Anxiety, y = Exam, colour = Gender, fill = Gender) #why both?
)
scatter2 +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Anxiety Score", y = "Exam Score") +
  cleanup +
  scale_fill_manual(
    name = "Gender of Participant",
    labels = c("Men", "Women"),
    values = c("purple", "grey")
  ) +
  scale_colour_manual(
    name = "Gender of Participant",
    labels = c("Men", "Women"),
    values = c("purple", "grey10")
  )

GGally for Multiple Visualization

library(GGally)
# Pairwise plot matrix of every column except the first one (Code).
ggpairs(
  data = exam[, -1], #no participant variable
  title = "Exam Anxiety, Scores, and Gender"
)

Bar Graphs

Bar Graph: One Independent Variable

Bar Chart: One Independent Variable

# Re-inspect the data: gender is already a factor, film is still numeric codes.
str(chickflick) #already fixed gender
## 'data.frame':    40 obs. of  3 variables:
##  $ gender : Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
##  $ film   : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "Name of Film"
##   ..- attr(*, "format.spss")= chr "F8.0"
##   ..- attr(*, "display_width")= int 18
##   ..- attr(*, "labels")= Named num [1:2] 1 2
##   .. ..- attr(*, "names")= chr [1:2] "Bridget Jones's Diary" "Memento"
##  $ arousal: num  22 13 16 10 18 24 13 14 19 23 ...
##   ..- attr(*, "label")= chr "Psychological Arousal During the Film"
##   ..- attr(*, "format.spss")= chr "F8.0"
# Replace the numeric film codes with a labelled factor.
chickflick$film <- factor(
  chickflick$film,
  levels = c(1, 2),
  labels = c("Bridget Jones", "Memento")
)

Bar Chart: One Independent Variable Example

# Bar height is the mean arousal per film, computed by stat_summary().
chickbar <- ggplot(data = chickflick, mapping = aes(x = film, y = arousal))
chickbar +
  stat_summary(fun = mean, geom = "bar", fill = "White", colour = "Black") +
  cleanup

Bar Chart: One Independent Variable Example

# Mean bars plus error bars computed by mean_cl_normal.
chickbar +
  stat_summary(fun = mean, geom = "bar", fill = "White", colour = "Black") +
  stat_summary(
    fun.data = mean_cl_normal,
    geom = "errorbar",
    position = position_dodge(width = 0.9),
    width = 0.2
  ) +
  cleanup

Bar Chart: One Independent Variable Example

# Final one-variable bar chart: means, error bars, axis titles and
# custom tick labels for the two films.
chickbar +
  stat_summary(fun = mean, geom = "bar", fill = "White", colour = "Black") +
  stat_summary(
    fun.data = mean_cl_normal,
    geom = "errorbar",
    position = position_dodge(width = 0.9),
    width = 0.2
  ) +
  labs(x = "Movie Watched by Participant", y = "Arousal Level") +
  cleanup +
  scale_x_discrete(labels = c("Girl Film", "Guy Film"))

Bar Chart: Two Independent Variables

# Two independent variables: film on the x axis, gender as fill.
# The bars are dodged side by side and the error bars use the same dodge.
chickbar2 <- ggplot(chickflick, aes(x = film, y = arousal, fill = gender))
chickbar2 +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  stat_summary(
    fun.data = mean_cl_normal,
    geom = "errorbar",
    position = position_dodge(width = 0.9),
    width = 0.2
  ) +
  labs(x = "Film Watched", y = "Arousal Level") +
  cleanup +
  scale_fill_manual(
    name = "Gender of Participant",
    labels = c("Boys", "Girls"),
    values = c("Gray30", "Gray")
  )

Line Graphs

Line Graphs: One Independent Variable

# Read the hiccups data (one column per intervention) and inspect it.
hiccups <- import(file = "data/Hiccups.csv")
str(object = hiccups)
## 'data.frame':    15 obs. of  4 variables:
##  $ Baseline: int  15 13 9 7 11 14 20 9 17 19 ...
##  $ Tongue  : int  9 18 17 15 18 8 3 16 10 10 ...
##  $ Carotid : int  7 7 5 10 7 10 7 12 9 8 ...
##  $ Other   : int  2 4 4 5 4 3 3 3 4 4 ...

Line Graphs: One Independent Variable

# Stack the four intervention columns into long format (variable, value).
# reshape::melt() calls this argument `measure.vars`; a name such as
# `measured` matches no argument and is silently ignored.
longhiccups <- melt(hiccups, 
                    measure.vars = c("Baseline", "Tongue", "Carotid", "Other"))
str(longhiccups)
## 'data.frame':    60 obs. of  2 variables:
##  $ variable: Factor w/ 4 levels "Baseline","Tongue",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : int  15 13 9 7 11 14 20 9 17 19 ...
# Give the two melted columns descriptive names.
names(longhiccups) <- c("Intervention", "Hiccups")

Line Graphs: One Independent Variable

# Line graph of the mean number of hiccups per intervention.
hiccupline <- ggplot(longhiccups, aes(x = Intervention, y = Hiccups))
hiccupline +
  ##adds the points
  stat_summary(fun = mean, geom = "point") +
  ##adds the line; group = 1 is necessary for mapping line to dots
  stat_summary(fun = mean, geom = "line", aes(group = 1)) +
  ##adds the error bars
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width = 0.2) +
  labs(x = "Intervention Type", y = "Number of Hiccups") +
  cleanup

Line Graphs: Two Independent Variables

Line Graphs: Two Independent Variables

# Read the texting data from Excel and inspect the column types.
texting <- import(file = "data/Texting.xlsx")
str(object = texting)
## 'data.frame':    50 obs. of  3 variables:
##  $ Group     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Baseline  : num  52 68 85 47 73 57 63 50 66 60 ...
##  $ Six_months: num  32 48 62 16 63 53 59 58 59 57 ...

Line Graphs: Two Independent Variables

# Label the numeric group codes, then reshape to long format so that the two
# measurement columns become rows of (variable, value) within each Group.
texting$Group <- factor(texting$Group,
                       levels = c(1,2),
                       labels = c("Texting Allowed", "No Texting Allowed"))
# reshape::melt() names its arguments `id.vars` and `measure.vars`; a name
# such as `measured` matches neither and is silently ignored.
longtexting <- melt(texting,
                   id.vars = c("Group"),
                   measure.vars = c("Baseline", "Six_months"))
str(longtexting)
## 'data.frame':    100 obs. of  3 variables:
##  $ Group   : Factor w/ 2 levels "Texting Allowed",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ variable: Factor w/ 2 levels "Baseline","Six_months": 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  52 68 85 47 73 57 63 50 66 60 ...
# Keep Group, and give the two melted columns descriptive names.
names(longtexting) <- c("Group", "Time", "Grammar_Score")

Line Graphs: Two Independent Variables

# Two independent variables: Time on the x axis, one coloured line per Group.
textline <- ggplot(
  longtexting,
  aes(x = Time, y = Grammar_Score, colour = Group)
)
textline +
  stat_summary(fun = mean, geom = "point") +
  #Group is the variable name
  stat_summary(fun = mean, geom = "line", aes(group = Group)) +
  stat_summary(fun.data = mean_cl_normal, geom = "errorbar", width = 0.2) +
  labs(x = "Measurement Time", y = "Mean Grammar Score") +
  cleanup +
  scale_colour_manual(
    name = "Texting Option",
    labels = c("All the texts", "None of the texts"),
    values = c("Black", "Grey")
  ) +
  scale_x_discrete(labels = c("Baseline", "Six Months"))

Summary

Additional Resources