Data Screening Part 1

Ziyuan Huang

Last Updated: 2025-12-18

Data Screening

An Important Note

What Order Should be followed?

Why is this the order?

Data Screening Example

library(rio)
master <- import("data/data_screening.csv")
str(master)
## 'data.frame':    137 obs. of  20 variables:
##  $ Sex     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : int  17 16 13 15 16 15 12 14 13 18 ...
##  $ SES     : int  2 3 2 3 2 3 3 3 1 3 ...
##  $ Grade   : int  11 7 5 6 11 7 4 6 4 9 ...
##  $ Absences: int  2 2 2 2 6 2 1 2 2 5 ...
##  $ RS1     : int  6 7 5 5 7 7 1 2 6 7 ...
##  $ RS2     : int  4 1 NA 5 4 7 6 3 6 6 ...
##  $ RS3     : int  2 1 5 5 4 7 NA 2 7 5 ...
##  $ RS4     : int  2 5 7 7 7 7 1 3 6 NA ...
##  $ RS5     : int  4 7 4 5 4 7 4 2 1 6 ...
##  $ RS6     : int  7 7 6 6 4 7 7 3 4 6 ...
##  $ RS7     : int  7 7 7 5 7 7 7 3 6 10 ...
##  $ RS8     : int  4 7 6 7 7 7 NA 3 6 7 ...
##  $ RS9     : int  5 4 6 6 4 7 NA 2 2 7 ...
##  $ RS10    : int  7 7 7 7 7 7 4 2 5 7 ...
##  $ RS11    : int  4 1 NA 6 7 7 3 3 6 5 ...
##  $ RS12    : int  7 7 5 6 4 7 7 3 6 6 ...
##  $ RS13    : int  4 4 7 6 7 7 7 2 6 6 ...
##  $ RS14    : int  7 7 6 6 4 7 7 3 2 6 ...
##  $ Health  : int  6 6 2 2 6 4 3 6 1 5 ...

Accuracy

Accuracy: Categorical Variables

notypos <- master #update the dataset with each step 
apply(notypos[ , c("Sex", "SES")], 2, table)
##   Sex SES
## 1  64   9
## 2  72  56
## 3   1  72
#3 here for sex is probably incorrect

Accuracy: Categorical Variables

## fix the categorical labels and typos
notypos$Sex <- factor(notypos$Sex, 
                     levels = c(1,2), #no 3
                     labels = c("Women", "Men"))
notypos$SES <- factor(notypos$SES, 
                     levels = c(1,2, 3),
                     labels = c("Low", "Medium", "High"))
apply(notypos[ , c("Sex", "SES")], 2, table)
## $Sex
## 
##   Men Women 
##    72    64 
## 
## $SES
## 
##   High    Low Medium 
##     72      9     56

Accuracy: Continuous Variables

summary(notypos)
##     Sex          Age            SES         Grade           Absences     
##  Women:64   Min.   :11.00   Low   : 9   Min.   : 1.000   Min.   : 1.000  
##  Men  :72   1st Qu.:13.00   Medium:56   1st Qu.: 4.000   1st Qu.: 2.000  
##  NA's : 1   Median :15.50   High  :72   Median : 6.000   Median : 2.000  
##             Mean   :15.05               Mean   : 5.883   Mean   : 3.358  
##             3rd Qu.:17.00               3rd Qu.: 8.000   3rd Qu.: 5.000  
##             Max.   :18.00               Max.   :35.000   Max.   :35.000  
##             NA's   :5                                                    
##       RS1             RS2             RS3             RS4            RS5       
##  Min.   :1.000   Min.   :1.000   Min.   : 1.00   Min.   :1.00   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.: 3.00   1st Qu.:3.00   1st Qu.:3.000  
##  Median :5.000   Median :5.000   Median : 4.00   Median :5.00   Median :4.000  
##  Mean   :4.858   Mean   :4.962   Mean   : 4.42   Mean   :4.55   Mean   :4.448  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.: 6.00   3rd Qu.:7.00   3rd Qu.:6.750  
##  Max.   :7.000   Max.   :7.000   Max.   :18.00   Max.   :7.00   Max.   :7.000  
##  NA's   :3       NA's   :5       NA's   :6       NA's   :6      NA's   :3      
##       RS6         RS7              RS8             RS9             RS10      
##  Min.   :1   Min.   : 1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4   1st Qu.: 4.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :5   Median : 5.000   Median :5.000   Median :5.000   Median :5.500  
##  Mean   :5   Mean   : 5.082   Mean   :4.764   Mean   :4.692   Mean   :5.313  
##  3rd Qu.:7   3rd Qu.: 7.000   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :7   Max.   :10.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##  NA's   :7   NA's   :3        NA's   :10      NA's   :7       NA's   :3      
##       RS11            RS12            RS13             RS14     
##  Min.   :1.000   Min.   :1.000   Min.   : 1.000   Min.   :1.00  
##  1st Qu.:3.000   1st Qu.:5.000   1st Qu.: 4.000   1st Qu.:4.00  
##  Median :5.000   Median :6.000   Median : 6.000   Median :5.00  
##  Mean   :4.641   Mean   :5.552   Mean   : 5.507   Mean   :4.97  
##  3rd Qu.:6.500   3rd Qu.:7.000   3rd Qu.: 7.000   3rd Qu.:7.00  
##  Max.   :7.000   Max.   :7.000   Max.   :15.000   Max.   :7.00  
##  NA's   :6       NA's   :3       NA's   :3        NA's   :5     
##      Health     
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :4.000  
##  Mean   :3.839  
##  3rd Qu.:6.000  
##  Max.   :7.000  
## 

How do we “fix” issues?

Accuracy: Continuous Variables

summary(notypos$Grade)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   6.000   5.883   8.000  35.000
notypos$Grade[ notypos$Grade > 12 ] <- NA
summary(notypos$Grade)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   6.000   5.669   8.000  11.000       1
summary(notypos$Absences)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   3.358   5.000  35.000
notypos$Absences[ notypos$Absences > 15 ] <- NA
summary(notypos$Absences)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   2.000   3.125   5.000   7.000       1

Accuracy: Continuous Variables

names(notypos)
##  [1] "Sex"      "Age"      "SES"      "Grade"    "Absences" "RS1"     
##  [7] "RS2"      "RS3"      "RS4"      "RS5"      "RS6"      "RS7"     
## [13] "RS8"      "RS9"      "RS10"     "RS11"     "RS12"     "RS13"    
## [19] "RS14"     "Health"
head(notypos[ , 6:19]) #lots of ways to do this part!
##   RS1 RS2 RS3 RS4 RS5 RS6 RS7 RS8 RS9 RS10 RS11 RS12 RS13 RS14
## 1   6   4   2   2   4   7   7   4   5    7    4    7    4    7
## 2   7   1   1   5   7   7   7   7   4    7    1    7    4    7
## 3   5  NA   5   7   4   6   7   6   6    7   NA    5    7    6
## 4   5   5   5   7   5   6   5   7   6    7    6    6    6    6
## 5   7   4   4   7   4   4   7   7   4    7    7    4    7    4
## 6   7   7   7   7   7   7   7   7   7    7    7    7    7    7
notypos[ , 6:19][ notypos[ , 6:19] > 7 ] <- NA
summary(notypos)
##     Sex          Age            SES         Grade           Absences    
##  Women:64   Min.   :11.00   Low   : 9   Min.   : 1.000   Min.   :1.000  
##  Men  :72   1st Qu.:13.00   Medium:56   1st Qu.: 4.000   1st Qu.:2.000  
##  NA's : 1   Median :15.50   High  :72   Median : 6.000   Median :2.000  
##             Mean   :15.05               Mean   : 5.669   Mean   :3.125  
##             3rd Qu.:17.00               3rd Qu.: 8.000   3rd Qu.:5.000  
##             Max.   :18.00               Max.   :11.000   Max.   :7.000  
##             NA's   :5                   NA's   :1        NA's   :1      
##       RS1             RS2             RS3             RS4            RS5       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:3.00   1st Qu.:3.000  
##  Median :5.000   Median :5.000   Median :4.000   Median :5.00   Median :4.000  
##  Mean   :4.858   Mean   :4.962   Mean   :4.315   Mean   :4.55   Mean   :4.448  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:7.00   3rd Qu.:6.750  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.00   Max.   :7.000  
##  NA's   :3       NA's   :5       NA's   :7       NA's   :6      NA's   :3      
##       RS6         RS7             RS8             RS9             RS10      
##  Min.   :1   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :5   Median :5.000   Median :5.000   Median :5.000   Median :5.500  
##  Mean   :5   Mean   :5.008   Mean   :4.764   Mean   :4.692   Mean   :5.313  
##  3rd Qu.:7   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :7   Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##  NA's   :7   NA's   :5       NA's   :10      NA's   :7       NA's   :3      
##       RS11            RS12            RS13            RS14          Health     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:5.000   1st Qu.:4.000   1st Qu.:4.00   1st Qu.:2.000  
##  Median :5.000   Median :6.000   Median :6.000   Median :5.00   Median :4.000  
##  Mean   :4.641   Mean   :5.552   Mean   :5.364   Mean   :4.97   Mean   :3.839  
##  3rd Qu.:6.500   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:7.00   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.00   Max.   :7.000  
##  NA's   :6       NA's   :3       NA's   :5       NA's   :5

Accuracy: Continuous Variables

Accuracy: Continuous Variables

names(notypos)
##  [1] "Sex"      "Age"      "SES"      "Grade"    "Absences" "RS1"     
##  [7] "RS2"      "RS3"      "RS4"      "RS5"      "RS6"      "RS7"     
## [13] "RS8"      "RS9"      "RS10"     "RS11"     "RS12"     "RS13"    
## [19] "RS14"     "Health"
apply(notypos[ , -c(1,3)], 2, mean, na.rm = T)
##       Age     Grade  Absences       RS1       RS2       RS3       RS4       RS5 
## 15.053030  5.669118  3.125000  4.858209  4.962121  4.315385  4.549618  4.447761 
##       RS6       RS7       RS8       RS9      RS10      RS11      RS12      RS13 
##  5.000000  5.007576  4.763780  4.692308  5.313433  4.641221  5.552239  5.363636 
##      RS14    Health 
##  4.969697  3.839416
apply(notypos[ , -c(1,3)], 2, sd, na.rm = T)
##      Age    Grade Absences      RS1      RS2      RS3      RS4      RS5 
## 1.931318 2.647305 1.815316 1.840043 1.722794 1.821638 2.116651 1.971859 
##      RS6      RS7      RS8      RS9     RS10     RS11     RS12     RS13 
## 1.787120 1.888075 1.945515 1.825089 1.696604 1.905735 1.656916 1.549552 
##     RS14   Health 
## 1.824094 1.749931

Missing Data

summary(notypos)
##     Sex          Age            SES         Grade           Absences    
##  Women:64   Min.   :11.00   Low   : 9   Min.   : 1.000   Min.   :1.000  
##  Men  :72   1st Qu.:13.00   Medium:56   1st Qu.: 4.000   1st Qu.:2.000  
##  NA's : 1   Median :15.50   High  :72   Median : 6.000   Median :2.000  
##             Mean   :15.05               Mean   : 5.669   Mean   :3.125  
##             3rd Qu.:17.00               3rd Qu.: 8.000   3rd Qu.:5.000  
##             Max.   :18.00               Max.   :11.000   Max.   :7.000  
##             NA's   :5                   NA's   :1        NA's   :1      
##       RS1             RS2             RS3             RS4            RS5       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:3.00   1st Qu.:3.000  
##  Median :5.000   Median :5.000   Median :4.000   Median :5.00   Median :4.000  
##  Mean   :4.858   Mean   :4.962   Mean   :4.315   Mean   :4.55   Mean   :4.448  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:7.00   3rd Qu.:6.750  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.00   Max.   :7.000  
##  NA's   :3       NA's   :5       NA's   :7       NA's   :6      NA's   :3      
##       RS6         RS7             RS8             RS9             RS10      
##  Min.   :1   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4   1st Qu.:4.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :5   Median :5.000   Median :5.000   Median :5.000   Median :5.500  
##  Mean   :5   Mean   :5.008   Mean   :4.764   Mean   :4.692   Mean   :5.313  
##  3rd Qu.:7   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :7   Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##  NA's   :7   NA's   :5       NA's   :10      NA's   :7       NA's   :3      
##       RS11            RS12            RS13            RS14          Health     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:5.000   1st Qu.:4.000   1st Qu.:4.00   1st Qu.:2.000  
##  Median :5.000   Median :6.000   Median :6.000   Median :5.00   Median :4.000  
##  Mean   :4.641   Mean   :5.552   Mean   :5.364   Mean   :4.97   Mean   :3.839  
##  3rd Qu.:6.500   3rd Qu.:7.000   3rd Qu.:7.000   3rd Qu.:7.00   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.00   Max.   :7.000  
##  NA's   :6       NA's   :3       NA's   :5       NA's   :5
apply(notypos, 2, function(x) { sum(is.na(x)) })
##      Sex      Age      SES    Grade Absences      RS1      RS2      RS3 
##        1        5        0        1        1        3        5        7 
##      RS4      RS5      RS6      RS7      RS8      RS9     RS10     RS11 
##        6        3        7        5       10        7        3        6 
##     RS12     RS13     RS14   Health 
##        3        5        5        0

Missing Data

Types of Missing Data

What do I do with missing data?

What do I do with missing data?

What do I do with missing data?

Visualize Missing Data

library(VIM, quietly = T)
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
aggr(notypos, numbers = T)

Replacing Missing Data: Rows

percentmiss <- function(x){ sum(is.na(x))/length(x) * 100 }
missing <- apply(notypos, 1, percentmiss)
table(missing)
## missing
##   0   5  10  15  20  70 
## 108  17   4   4   1   3

Replacing Missing Data: Rows

replace_rows <- subset(notypos, missing <= 5) #5%
noreplace_rows <- subset(notypos, missing > 5)

nrow(notypos)
## [1] 137
nrow(replace_rows)
## [1] 125
nrow(noreplace_rows)
## [1] 12

Replacing Missing Data: Columns

apply(replace_rows, 2, percentmiss)
##      Sex      Age      SES    Grade Absences      RS1      RS2      RS3 
##      0.8      3.2      0.0      0.8      0.8      0.0      0.0      0.8 
##      RS4      RS5      RS6      RS7      RS8      RS9     RS10     RS11 
##      0.0      0.0      1.6      0.0      1.6      0.0      0.0      0.8 
##     RS12     RS13     RS14   Health 
##      0.0      1.6      1.6      0.0

Replacing Missing Data: Columns

replace_columns <- replace_rows[ , -c(1,2,4)]
noreplace_columns <- replace_rows[ , c(1,2,4)] #notice these are both replace_rows

Replacing Missing Data: Using mice

library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
temp_no_miss <- mice(replace_columns)
## 
##  iter imp variable
##   1   1  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   1   2  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   1   3  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   1   4  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   1   5  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   2   1  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   2   2  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   2   3  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   2   4  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   2   5  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   3   1  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   3   2  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   3   3  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   3   4  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   3   5  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   4   1  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   4   2  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   4   3  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   4   4  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   4   5  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   5   1  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   5   2  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   5   3  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   5   4  Absences  RS3  RS6  RS8  RS11  RS13  RS14
##   5   5  Absences  RS3  RS6  RS8  RS11  RS13  RS14

Replacing Missing Data: Using mice

nomiss <- complete(temp_no_miss, 1) #pick a dataset 1-5 

#combine back together
dim(notypos) #original data from previous step
## [1] 137  20
dim(nomiss) #replaced data
## [1] 125  17
#get all columns 
all_columns <- cbind(noreplace_columns, nomiss)
dim(all_columns)
## [1] 125  20
#get all rows
all_rows <- rbind(noreplace_rows, all_columns)
dim(all_rows)
## [1] 137  20

Outliers

Outliers: Types

Outliers: Mahalanobis

Outliers: Analyze and Eliminate

## you can use all columns or all rows here
## however, all rows has missing data, which will not get a score 
str(all_columns)
## 'data.frame':    125 obs. of  20 variables:
##  $ Sex     : Factor w/ 2 levels "Women","Men": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : int  17 16 15 16 15 14 13 15 16 17 ...
##  $ Grade   : int  11 7 6 11 7 6 4 6 8 11 ...
##  $ SES     : Factor w/ 3 levels "Low","Medium",..: 2 3 3 2 3 3 1 2 2 3 ...
##  $ Absences: int  2 2 2 6 2 2 2 2 1 6 ...
##  $ RS1     : int  6 7 5 7 7 2 6 4 3 4 ...
##  $ RS2     : int  4 1 5 4 7 3 6 6 5 3 ...
##  $ RS3     : int  2 1 5 4 7 2 7 6 3 4 ...
##  $ RS4     : int  2 5 7 7 7 3 6 6 2 4 ...
##  $ RS5     : int  4 7 5 4 7 2 1 6 1 4 ...
##  $ RS6     : int  7 7 6 4 7 3 4 6 7 4 ...
##  $ RS7     : int  7 7 5 7 7 3 6 6 4 4 ...
##  $ RS8     : int  4 7 7 7 7 3 6 6 3 7 ...
##  $ RS9     : int  5 4 6 4 7 2 2 6 3 4 ...
##  $ RS10    : int  7 7 7 7 7 2 5 6 3 4 ...
##  $ RS11    : int  4 1 6 7 7 3 6 6 3 6 ...
##  $ RS12    : int  7 7 6 4 7 3 6 6 2 5 ...
##  $ RS13    : int  4 4 6 7 7 2 6 6 5 6 ...
##  $ RS14    : int  7 7 6 4 7 3 2 6 3 4 ...
##  $ Health  : int  6 6 2 6 4 6 1 2 3 1 ...
mahal <- mahalanobis(all_columns[ , -c(1,4)],
                    colMeans(all_columns[ , -c(1,4)], na.rm=TRUE),
                    cov(all_columns[ , -c(1,4)], use ="pairwise.complete.obs"))

Outliers: Analyze and Eliminate

## remember to match the number of columns
cutoff <- qchisq(1-.001, ncol(all_columns[ , -c(1,4)]))

## df and cutoff
ncol(all_columns[ , -c(1,4)])
## [1] 18
cutoff
## [1] 42.3124
##how many outliers? Look at FALSE
summary(mahal < cutoff)
##    Mode   FALSE    TRUE    NA's 
## logical       1     119       5
## eliminate
noout <- subset(all_columns, mahal < cutoff)
dim(all_columns)
## [1] 125  20
dim(noout)
## [1] 119  20

Summary

References