# Period: Best 20 articles from Nov. 1 - Dec. 31, 2020
# LDA is one of the most common algorithms for topic modeling. It assumes that each document in the corpus contains a mixture of the topics found across the whole corpus.
# The topic structure itself is unobserved: only the words and documents can be seen, not the topics, so the topic structure must be inferred from the words and documents.
# Fit a 4-topic LDA model to the 20 best articles.
library(tm)
## Loading required package: NLP
library(topicmodels)
library(SnowballC)
library(tidytext)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## ✔ tibble 3.0.1 ✔ purrr 0.3.4
## ✔ tidyr 1.1.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
## Load the data
setwd('D:/★이수진/★분석/기사분석/LDA202101')
# Collect the names of the 20 text files
filenames <- list.files(getwd(), pattern = '\\.txt$')
# Read each file into a character vector
files <- lapply(filenames, readLines)
# Build a corpus from a vector source
docs <- Corpus(VectorSource(files))
writeLines(as.character(docs[[1]]))
## c("Vials labelled \xe2\u0080쏞OVID-19 / Coronavirus vaccine / Injection only\xe2\u0080\x9d and a syringe in front of an AstraZeneca logo in this Oct. 31 illustration / Reuters-YonhapBy Kim Yoo-chulThe government has signed an agreement with AstraZeneca to secure 25 million doses of its COVID-19 vaccine, two sources directly involved with the matter said Monday.혻They said SK Bioscience would handle the manufacture of the vaccine, known as AZD1222, for domestic distribution.According to the sources, the size of the deal is known to be \xe2\u0080쏿 few million dollars.\xe2\u0080\x9d \xe2\u0080쏷he government agreed with AstraZeneca to secure a supply of its coronavirus vaccine,\xe2\u0080\x9d a government official said, asking not to be identified. \xe2\u0080쏰ther relevant details will be released within this week, at the earliest possible date.\xe2\u0080\x9d The official said the value of the deal would be equivalent to those the pharmaceutical company recently signed with Thailand and the Philippines. Thailand signed a $200 million deal to procure 26 million doses of the AstraZeneca vaccine developed in collaboration with Oxford University. In the Philippines, more than 30 companies signed an agreement to buy at least 2.6 million doses from AstraZeneca, according to media reports. AstraZeneca was unavailable for comments.Another government official said the government was in the \xe2\u0080쐄inal stages\xe2\u0080\x9d of fixing \xe2\u0080쐓ome outstanding issues,\xe2\u0080\x9d such as guaranteed vaccine amount and total payment, as ruling and opposition lawmakers were negotiating on the amount of a third COVID-19 emergency relief funding package and what it should cover.\xe2\u0080쏷he health ministry will take the central role \xe2\u0080\x95 government officials and health experts are on track to narrow their views over pricing and supply volume,\xe2\u0080\x9d the official said, adding that officials were working on how to prioritize vaccine recipients.Prime Minister Chung Sye-kyun said Korea was in a \xe2\u0080쐁ritical phase\xe2\u0080\x9d after new daily COVID-19 infections soared above the 400 threshold. Chung told the public that the government would secure 30 million vaccine doses, although ruling party lawmakers are asking relevant agencies to secure more.On a related note, AstraZeneca signed a deal with Korea's non-listed SK Bioscience to manufacture its vaccine products. The collaboration calls for the SK affiliate to manufacture AZD1222 for local and global markets. The drug constituents are now being produced at SK's vaccine manufacturing facility in Andong, 270 kilometers southeast of Seoul with SK's local contractors handling international distribution, according to SK Bioscience.AZD1222, the result of collaboration between Oxford University and AstraZeneca, is drawing attention around the world because it was the first vaccine material to be given permission to conduct Phase 3 clinical trials.",
## "", "")
## Preprocess the data
# Remove symbols
# (Note: despite its name, toSpace deletes the matched pattern rather than replacing it with a space.)
toSpace <- content_transformer(function(x, pattern) {return (gsub(pattern, '', x))})
docs <- tm_map(docs, toSpace, '-')
## Warning in tm_map.SimpleCorpus(docs, toSpace, "-"): transformation drops
## documents
docs <- tm_map(docs, toSpace, ':')
## Warning in tm_map.SimpleCorpus(docs, toSpace, ":"): transformation drops
## documents
docs <- tm_map(docs, toSpace, '“')
## Warning in tm_map.SimpleCorpus(docs, toSpace, "“"): transformation drops
## documents
docs <- tm_map(docs, toSpace, '”')
## Warning in tm_map.SimpleCorpus(docs, toSpace, "”"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "'")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "'"): transformation drops
## documents
# Remove punctuation, numbers, stop words, and extra whitespace
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
## documents
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
docs <- tm_map(docs, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
# Define and remove custom stop words
myStopwords <- c('can','say','said','will','like','even','well','one','hour','also','take','now','new','use','the')
docs <- tm_map(docs, removeWords, myStopwords)
## Warning in tm_map.SimpleCorpus(docs, removeWords, myStopwords): transformation
## drops documents
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]","",x)
docs <- tm_map(docs, removeSpecialChars)
## Warning in tm_map.SimpleCorpus(docs, removeSpecialChars): transformation drops
## documents
# Convert to lowercase (ideally this would precede the stop-word removal above, since removeWords is case-sensitive)
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Stem the documents
docs <- tm_map(docs,stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents
writeLines(as.character(docs[[5]]))
## nanci kpop girl group momoland courtesi mld entertainmentbi dong sunhwan kpop girl band momoland includ he most beauti face the singer rank th tc candler he most beauti face list unveil dec tc candler produc nnual independ critic list rank celebr around globe base appear it consid grace eleg among mani other evalu star other kpop star includ twice tzuyu includ list year nanci debut member momoland minialbum welcom momoland the sixpiec act releas string hit includ bboom bboom thumb up it latest releas eadi not came nov
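# Added note: the truncated forms above ("beauti", "includ", "courtesi") are
# Porter stems produced by stemDocument() via SnowballC; the stemmer can be
# checked directly on individual words:
SnowballC::wordStem(c('beautiful', 'including', 'courtesy', 'released'))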
# The topicmodels package requires a DocumentTermMatrix, so build one from the corpus as follows.
dtm <- DocumentTermMatrix(docs)
dtm
## <<DocumentTermMatrix (documents: 20, terms: 2185)>>
## Non-/sparse entries: 3748/39952
## Sparsity : 91%
## Maximal term length: 19
## Weighting : term frequency (tf)
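# Added sketch (a defensive step, not in the original): LDA() fails if any row
# of the DTM is all zeros, so it is worth confirming that every document kept
# at least one term after preprocessing:
rowTotals <- apply(as.matrix(dtm), 1, sum)
dtm <- dtm[rowTotals > 0, ]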
# Topic modeling: fit a single LDA model with k = 4 topics to the 20 articles
Best10_lda <- LDA(dtm, k = 4, control = list(seed = 1234))
Best10_lda
## A LDA_VEM topic model with 4 topics.
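# Added sketch: k = 4 here was picked by inspection (see the note before the
# visualization below). A common heuristic is to compare perplexity across
# candidate values of k using topicmodels::perplexity(); lower is better,
# though perplexity tends to keep decreasing as k grows:
ks <- c(2, 4, 6, 8)
fits <- lapply(ks, function(k) LDA(dtm, k = k, control = list(seed = 1234)))
data.frame(k = ks, perplexity = sapply(fits, topicmodels::perplexity))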
# Per-topic word probabilities
# One topic-term pair per row; beta is the probability that the term is generated by that topic.
# For example, "accord" has probability 1.09e-03 of being generated by topic 1 but 3.55e-03 by topic 2.
Best10_topics <- tidy(Best10_lda, matrix = "beta")
Best10_topics
## # A tibble: 8,740 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 accord 1.09e- 3
## 2 2 accord 3.55e- 3
## 3 3 accord 7.20e- 4
## 4 4 accord 1.66e- 3
## 5 1 affili 1.43e-231
## 6 2 affili 1.74e-228
## 7 3 affili 2.66e-230
## 8 4 affili 2.50e- 3
## 9 1 agenc 2.18e- 3
## 10 2 agenc 3.32e-223
## # ... with 8,730 more rows
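# Added sketch: to inspect a single term such as the "accord" example above,
# filter it out and widen so its four topic probabilities sit side by side:
Best10_topics %>%
  filter(term == 'accord') %>%
  pivot_wider(names_from = topic, values_from = beta, names_prefix = 'topic')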
# Visualization
# To interpret the four topics, plot the top terms per topic (several values of k were tried; 4 looked best). The four topics roughly cover:
# 1. BTS, performance, series
# 2. Lee, actor, lyricist
# 3. Series, drama, award, Kim, music
# 4. Korea, award, year, Asiana, vaccine, Korean
Best10_top_terms <- Best10_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
Best10_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() + ggtitle('Top terms in each LDA topic')
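# Added refinement (assuming a tidytext version that provides
# reorder_within()): the reorder() call above sorts terms by overall beta, so
# bars can appear out of order inside a facet; reordering within each topic
# fixes that:
Best10_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = 'free') +
  scale_x_reordered() +
  coord_flip() + ggtitle('Top terms in each LDA topic')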
# Per-document topic probabilities
# Besides estimating each topic as a mixture of words, LDA also models each document as a mixture of topics.
# Gamma is the estimated proportion of a document's words that were generated by the given topic.
# For example, the model estimates that essentially none of document 1's words come from topic 1 (gamma ≈ 0.0000624, about 0.006%).
# To sanity-check this, look at the most common words in document 1 below: vaccin, astrazeneca, govern.
# Likewise, almost none of document 2's words come from topic 1 (gamma ≈ 0.0000381); its most common words are award, music, organ.
# By contrast, essentially all of document 3's words come from topic 1 (gamma ≈ 1.00); its most common words are bts, band, fan.
# Document 4 is again almost unrelated to topic 1 (gamma ≈ 0.000132); its most common words are kakao, first, season.
Best10_lda_gamma <- tidy(Best10_lda, matrix = "gamma")
Best10_lda_gamma
## # A tibble: 80 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 1 1 0.0000624
## 2 2 1 0.0000381
## 3 3 1 1.00
## 4 4 1 0.000132
## 5 5 1 0.000186
## 6 6 1 1.00
## 7 7 1 0.000108
## 8 8 1 0.000102
## 9 9 1 1.00
## 10 10 1 0.0000382
## # ... with 70 more rows
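# Added sketch (assuming dplyr >= 1.0 for slice_max(); with older versions use
# top_n(1, gamma)): label each document with its single most probable topic:
Best10_lda_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%
  ungroup() %>%
  arrange(as.numeric(document))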
tidy(dtm) %>%
filter(document == 1) %>%
arrange(desc(count))
## # A tibble: 161 x 3
## document term count
## <chr> <chr> <dbl>
## 1 1 vaccin 10
## 2 1 astrazeneca 8
## 3 1 govern 7
## 4 1 million 6
## 5 1 offici 6
## 6 1 sign 5
## 7 1 deal 4
## 8 1 dose 4
## 9 1 manufactur 4
## 10 1 secur 4
## # ... with 151 more rows
tidy(dtm) %>%
filter(document == 2) %>%
arrange(desc(count))
## # A tibble: 263 x 3
## document term count
## <chr> <chr> <dbl>
## 1 2 award 24
## 2 2 music 15
## 3 2 organ 9
## 4 2 event 7
## 5 2 judg 5
## 6 2 mama 5
## 7 2 chart 4
## 8 2 credibl 4
## 9 2 data 4
## 10 2 han 4
## # ... with 253 more rows
tidy(dtm) %>%
filter(document == 3) %>%
arrange(desc(count))
## # A tibble: 428 x 3
## document term count
## <chr> <chr> <dbl>
## 1 3 bts 15
## 2 3 band 9
## 3 3 fan 8
## 4 3 social 7
## 5 3 year 7
## 6 3 act 6
## 7 3 group 6
## 8 3 song 6
## 9 3 billboard 5
## 10 3 feel 5
## # ... with 418 more rows
tidy(dtm) %>%
filter(document == 4) %>%
arrange(desc(count))
## # A tibble: 78 x 3
## document term count
## <chr> <chr> <dbl>
## 1 4 kakao 6
## 2 4 first 4
## 3 4 season 4
## 4 4 coupl 3
## 5 4 drama 3
## 6 4 iti 3
## 7 4 kim 3
## 8 4 love 3
## 9 4 seri 3
## 10 4 way 3
## # ... with 68 more rows
tidy(dtm) %>%
filter(document == 5) %>%
arrange(desc(count))
## # A tibble: 58 x 3
## document term count
## <chr> <chr> <dbl>
## 1 5 includ 4
## 2 5 momoland 4
## 3 5 kpop 3
## 4 5 list 3
## 5 5 bboom 2
## 6 5 beauti 2
## 7 5 candler 2
## 8 5 face 2
## 9 5 girl 2
## 10 5 most 2
## # ... with 48 more rows
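# Added sketch: the five per-document checks above can be collapsed into one
# grouped query:
tidy(dtm) %>%
  filter(document %in% as.character(1:5)) %>%
  group_by(document) %>%
  slice_max(count, n = 10, with_ties = FALSE) %>%
  ungroup()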
Any advice on anything that is incorrect here would be appreciated.
※ Code source: https://medium.com/swiftworld/topic-modeling-of-new-york-times-articles-11688837d32f