In this report, I build some visualizations from the data to show a few fun facts.

Get started

Loading library

library(ggplot2)
library(dplyr)
library(tidyr)
library(magrittr)

Importing data and preprocessing

# Read the exported course data, keep only the four columns used in this
# report, and replace a missing course list (NA) with an empty string.
df <- read.csv("export_replaced.csv", fileEncoding = "UTF-8", stringsAsFactors = FALSE) %>%
    select(TotalXp, NumExercisesCompleted, NumCoursesCompleted, CompletedCourses) %>%
    replace_na(list(CompletedCourses = ""))

Have a look at the dataset

# Preview the first rows of the data frame.
head(df)
##   TotalXp NumExercisesCompleted NumCoursesCompleted
## 1   14750                   157                   2
## 2   11450                   132                   2
## 3   14500                   152                   1
## 4   19350                   205                   3
## 5    6000                    61                   0
## 6   12400                   136                   1
##                                CompletedCourses
## 1                 Introduction to R, R 語言導論
## 2       Introduction to R, Introduction to Data
## 3                                    R 語言導論
## 4 Introduction to R, R 語言導論, Intermediate R
## 5                                              
## 6                             Introduction to R
# Show the structure of the data frame: dimensions, column types and leading values.
str(df)
## 'data.frame':    44 obs. of  4 variables:
##  $ TotalXp              : int  14750 11450 14500 19350 6000 12400 8470 10350 9600 18615 ...
##  $ NumExercisesCompleted: int  157 132 152 205 61 136 101 109 100 213 ...
##  $ NumCoursesCompleted  : int  2 2 1 3 0 1 0 1 1 2 ...
##  $ CompletedCourses     : chr  "Introduction to R, R 語言導論" "Introduction to R, Introduction to Data" "R 語言導論" "Introduction to R, R 語言導論, Intermediate R" ...

Analysis

Tidy up the columns

To make the visualizations easier to build, we first need to tidy up the data. We start by finding out who has finished HW1, i.e. who has a TotalXp of at least 10000.

# Flag every student whose total XP is at least 10000 as having passed.
df <- mutate(df, Passed = TotalXp >= 10000)

# Preview the data with the new Passed column.
head(df)
##   TotalXp NumExercisesCompleted NumCoursesCompleted
## 1   14750                   157                   2
## 2   11450                   132                   2
## 3   14500                   152                   1
## 4   19350                   205                   3
## 5    6000                    61                   0
## 6   12400                   136                   1
##                                CompletedCourses Passed
## 1                 Introduction to R, R 語言導論   TRUE
## 2       Introduction to R, Introduction to Data   TRUE
## 3                                    R 語言導論   TRUE
## 4 Introduction to R, R 語言導論, Intermediate R   TRUE
## 5                                                FALSE
## 6                             Introduction to R   TRUE

And since the column “CompletedCourses” is too messy to visualize, we need to change its format: each course becomes its own TRUE/FALSE column.

# Split the comma-separated course string into one character vector per student.
df <- df %>%
    mutate(CompletedCourses = strsplit(CompletedCourses, split = ", "))

# Collect every distinct course name that occurs in the data.
courseNames <- unique(unlist(df$CompletedCourses))

# Add one logical column per course: TRUE when the student completed it.
# vapply() enforces exactly one logical value per student, whereas sapply()
# would silently change its return type on unexpected input.
for (name in courseNames) {
    df[[name]] <- vapply(df$CompletedCourses, function(courses) name %in% courses, logical(1))
}

# The list column is no longer needed once the per-course flags exist.
df <- select(df, -CompletedCourses)

head(df)
##   TotalXp NumExercisesCompleted NumCoursesCompleted Passed
## 1   14750                   157                   2   TRUE
## 2   11450                   132                   2   TRUE
## 3   14500                   152                   1   TRUE
## 4   19350                   205                   3   TRUE
## 5    6000                    61                   0  FALSE
## 6   12400                   136                   1   TRUE
##   Introduction to R R 語言導論 Introduction to Data Intermediate R
## 1              TRUE       TRUE                FALSE          FALSE
## 2              TRUE      FALSE                 TRUE          FALSE
## 3             FALSE       TRUE                FALSE          FALSE
## 4              TRUE       TRUE                FALSE           TRUE
## 5             FALSE      FALSE                FALSE          FALSE
## 6              TRUE      FALSE                FALSE          FALSE
##   Introduction to Python 資料框整理技巧 Intro to SQL for Data Science
## 1                  FALSE          FALSE                         FALSE
## 2                  FALSE          FALSE                         FALSE
## 3                  FALSE          FALSE                         FALSE
## 4                  FALSE          FALSE                         FALSE
## 5                  FALSE          FALSE                         FALSE
## 6                  FALSE          FALSE                         FALSE
##   Intermediate Python for Data Science Exploratory Data Analysis
## 1                                FALSE                     FALSE
## 2                                FALSE                     FALSE
## 3                                FALSE                     FALSE
## 4                                FALSE                     FALSE
## 5                                FALSE                     FALSE
## 6                                FALSE                     FALSE
##   Exploratory Data Analysis in R: Case Study Introduction to the Tidyverse
## 1                                      FALSE                         FALSE
## 2                                      FALSE                         FALSE
## 3                                      FALSE                         FALSE
## 4                                      FALSE                         FALSE
## 5                                      FALSE                         FALSE
## 6                                      FALSE                         FALSE
##   Data Manipulation in R with dplyr Importing Data in R (Part 1)
## 1                             FALSE                        FALSE
## 2                             FALSE                        FALSE
## 3                             FALSE                        FALSE
## 4                             FALSE                        FALSE
## 5                             FALSE                        FALSE
## 6                             FALSE                        FALSE
##   Working with the RStudio IDE (Part 1) Reporting with R Markdown
## 1                                 FALSE                     FALSE
## 2                                 FALSE                     FALSE
## 3                                 FALSE                     FALSE
## 4                                 FALSE                     FALSE
## 5                                 FALSE                     FALSE
## 6                                 FALSE                     FALSE

Visualizations

Courses finished count

# Count how many students completed each course and draw one bar per course.
# drop = FALSE keeps a data frame even when there is only a single course
# column, so colSums() keeps working.
df[, courseNames, drop = FALSE] %>%
    colSums() %>%
    data.frame(key = names(.), value = .) %>%
    ggplot(aes(key, value)) +
    geom_bar(stat = "identity") +
    # Label each bar from the plotted data itself instead of the global
    # courseNames vector, so labels cannot drift out of sync with the bars.
    geom_text(aes(label = key), position = position_dodge(width = 0.9), hjust = -0.1, angle = 90) +
    # The course names are written on the bars, so the x axis is left blank.
    theme(axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank())

# Polished version of the course-count chart: keep only courses completed by
# more than 5 students, order the bars from most to least completed, colour
# them by course and print the head count above each bar.
# Axis label "人數" = number of people; legend title "課程名稱" = course name.
df[, courseNames, drop = FALSE] %>%
    colSums() %>%
    sort(decreasing = TRUE) %>%
    data.frame(key = names(.), value = .) %>%
    filter(value > 5) %>%
    ggplot(aes(reorder(key, -value), value, fill = reorder(key, -value))) +
    geom_bar(stat = "identity") +
    theme(axis.title.x = element_blank()) +
    ylab("人數") +
    guides(fill = guide_legend(title = "課程名稱")) +
    geom_text(aes(label = value), position = position_dodge(width = 0.9), vjust = -0.6) +
    # expand_limits() keeps the y axis covering at least 0..33 but, unlike
    # ylim(0, 33), never drops a bar whose count grows beyond 33.
    expand_limits(y = c(0, 33))

Histogram of TotalXp

# Histogram of TotalXp, coloured by whether the student passed.
# The bins are aligned to the 10000 XP pass mark and closed on the left, so a
# student with exactly 10000 XP is counted to the right of the dashed line.
# This replaces the earlier trick of adding 1 to every TotalXp, which produced
# the same binning for integer XP but shifted the plotted data.
df %>%
    ggplot(aes(TotalXp, fill = Passed)) +
    geom_histogram(binwidth = 4000, boundary = 10000, closed = "left") +
    # A constant xintercept outside aes() draws the line once instead of once per row.
    geom_vline(xintercept = 10000, color = "black", linetype = "dashed", size = 0.5)

# Print the share of students who passed, as a percentage
# (the label "完成比例" means "completion rate").
(sum(df$Passed) / nrow(df) * 100) %>%
    paste0("完成比例: ", ., "%") %>%
    print()
## [1] "完成比例: 75%"
# Box plot summarising the distribution of TotalXp over all students.
df %>%
    ggplot(aes(y = TotalXp)) +
    geom_boxplot()

Who finished it twice?

# Number of students who finished "Introduction to R" in both languages,
# i.e. both course flags are TRUE.
both <- sum(df[["Introduction to R"]] & df[["R 語言導論"]])

# Number of students who finished the Chinese version.
inCH <- sum(df[["R 語言導論"]])

# Number of students who finished the English version.
inEN <- sum(df[["Introduction to R"]])

# Bar chart comparing the three counts, with the count printed above each bar.
data.frame(count = c(both, inCH, inEN), item = factor(c("both", "CH", "EN"))) %>%
    ggplot(aes(x = item, y = count, fill = item)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.6) +
    # expand_limits() keeps the y axis covering at least 0..33 but, unlike
    # ylim(0, 33), never drops a bar whose count grows beyond 33.
    expand_limits(y = c(0, 33))

# XP distribution of the students who finished "Introduction to R" in both
# languages, coloured by whether they reached 16200 XP.
# NOTE(review): 16200 is presumably the XP earned by completing both courses — confirm.
df %>%
    filter(`Introduction to R`, `R 語言導論`) %>%
    # State the threshold directly. The earlier version added 1 to TotalXp and
    # then tested "> 16200", which is the same as ">= 16200" on integer XP.
    mutate(Passed = TotalXp >= 16200) %>%
    ggplot(aes(TotalXp, fill = Passed)) +
    # Left-closed bins on the raw XP reproduce the binning of the old "+1"
    # shift without altering the plotted values.
    geom_histogram(binwidth = 2000, closed = "left") +
    # A constant xintercept outside aes() draws the line once instead of once per row.
    geom_vline(xintercept = 16200, color = "black", linetype = "dashed", size = 0.5)

Scatter plot: “TotalXp” over “NumCoursesCompleted”

# Scatter plot of TotalXp against the number of completed courses: jittered
# points coloured by pass status, plus a linear fit drawn without its
# confidence band.
df %>%
    ggplot(aes(x = NumCoursesCompleted, y = TotalXp)) +
    geom_point(aes(colour = Passed), position = position_jitter(width = .05)) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE)