library(dplyr)
library(rvest)
library(wordcloud)

抓取整個頁面內容

# Download and parse page 1 of the "frequent" question listing, then echo
# the parsed document.
full_page <- "https://stackoverflow.com/questions?sort=frequent&page=1" %>%
    read_html()
full_page
## {xml_document}
## <html class="html__responsive">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="questions-page unified-theme">\r\n    <div id="notify-c ...

找出資料在頁面中的位置後提取資料

# Select every element matching ".post-tag" inside the "#questions"
# container of the parsed page, then preview the first six matches.
tag_nodes <- full_page %>%
    html_nodes("#questions .post-tag")
head(tag_nodes, n = 6L)
## {xml_nodeset (6)}
## [1] <a href="/questions/tagged/java" class="post-tag" title="show questi ...
## [2] <a href="/questions/tagged/nullpointerexception" class="post-tag" ti ...
## [3] <a href="/questions/tagged/javascript" class="post-tag" title="show  ...
## [4] <a href="/questions/tagged/ajax" class="post-tag" title="show questi ...
## [5] <a href="/questions/tagged/asynchronous" class="post-tag" title="sho ...
## [6] <a href="/questions/tagged/xmlhttprequest" class="post-tag" title="s ...
# Pull the text content out of each selected node and preview the first six.
tags <- tag_nodes %>%
    html_text()
head(tags, n = 6L)
## [1] "java"                 "nullpointerexception" "javascript"          
## [4] "ajax"                 "asynchronous"         "xmlhttprequest"

使用 for-loop 抓取多個頁面的標籤

# Scrape the tag labels from listing pages 1..page and pool them into one
# character vector, `all_tags`.
page <- 10
listing_url <- "https://stackoverflow.com/questions?sort=frequent&page="

# seq_len() yields an empty sequence when page is 0, whereas 1:page would
# iterate over c(1, 0). Collecting one character vector per page and
# flattening once avoids re-copying the accumulated vector on every
# iteration, and keeps the loop from overwriting the `tags` variable
# assigned earlier in the script.
tags_by_page <- lapply(seq_len(page), function(i) {
    paste0(listing_url, i) %>%
        read_html() %>%
        html_nodes("#questions .post-tag") %>%
        html_text()
})

# as.character() keeps the result a character vector even when no pages
# were requested (unlist() of an empty list is NULL).
all_tags <- as.character(unlist(tags_by_page, use.names = FALSE))
head(all_tags, 40)
##  [1] "java"                   "nullpointerexception"  
##  [3] "javascript"             "ajax"                  
##  [5] "asynchronous"           "xmlhttprequest"        
##  [7] "event-loop"             "r"                     
##  [9] "r-faq"                  "php"                   
## [11] "mysql"                  "sql"                   
## [13] "security"               "sql-injection"         
## [15] "html"                   "regex"                 
## [17] "xhtml"                  "c#"                    
## [19] ".net"                   "vb.net"                
## [21] "null"                   "nullreferenceexception"
## [23] "php"                    "arrays"                
## [25] "variables"              "warnings"              
## [27] "undefined-index"        "java"                  
## [29] "string"                 "equality"              
## [31] "php"                    "mysql"                 
## [33] "database"               "c++"                   
## [35] "c++-faq"                "php"                   
## [37] "parsing"                "debugging"             
## [39] "syntax-error"           "math"

計算標籤出現的次數後做成文字雲

# Count how many times each tag occurs in the pooled vector.
freq <- table(all_tags)

# Preview the six most frequent tags. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
head(sort(freq, decreasing = TRUE))

# Draw the word cloud: every tag is eligible (min.freq = 1), at most 200
# words are drawn, and the most frequent words are placed first
# (random.order = FALSE).
# NOTE(review): brewer.pal() is not defined in this file; presumably it is
# attached through wordcloud's RColorBrewer dependency -- confirm.
wordcloud(
    names(freq),
    freq,
    min.freq = 1,
    scale = c(4, .2),
    max.words = 200,
    random.order = FALSE,
    colors = brewer.pal(5, "Dark2")
)
1000頁文字雲結果

1000頁文字雲結果

Note: 上方程式碼僅示範爬取 10 個頁面,最後的文字雲圖片則是爬取 1000 個頁面的結果。