Text mining

Using the tm package

We use `crude`, an example corpus that ships with the tm package, so no text needs to be collected or prepared first.

# Attach the tm text-mining package (it pulls in NLP, as the message below shows).
library(tm)
## Loading required package: NLP
# Load the example corpus into the workspace under the name `crude`.
data(crude)
# Printing the corpus gives a summary: a VCorpus holding 20 documents.
crude
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 20
# inspect() on a subset lists every selected document together with its
# number of metadata fields and its length in characters.
inspect(crude[1:3])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## $`reut-00001.xml`
## <<PlainTextDocument>>
## Metadata:  15
## Content:  chars: 527
## 
## $`reut-00002.xml`
## <<PlainTextDocument>>
## Metadata:  15
## Content:  chars: 2634
## 
## $`reut-00004.xml`
## <<PlainTextDocument>>
## Metadata:  15
## Content:  chars: 330

Create a Term Document Matrix

# Build a term-document matrix: one row per term, one column per document,
# each cell holding how often the term occurs in that document.
tdm <- TermDocumentMatrix(crude)
tdm
## <<TermDocumentMatrix (terms: 1266, documents: 20)>>
## Non-/sparse entries: 2255/23065
## Sparsity           : 91%
## Maximal term length: 17
## Weighting          : term frequency (tf)
# Convert the sparse representation into an ordinary dense matrix so that
# base R functions can be applied to it directly.
my_matrix <- as.matrix(tdm)
dim(my_matrix)
## [1] 1266   20
# Count the non-zero (FALSE) and zero (TRUE) cells; these match the
# "Non-/sparse entries" line printed for tdm above.
table(my_matrix == 0)
## 
## FALSE  TRUE 
##  2255 23065
# Sparsity is the proportion of cells that are zero, i.e. the share of
# term/document pairs where the term does not occur in the document.
# It is computed from the matrix itself (the mean of a logical vector is the
# fraction of TRUE values) instead of from counts copied by hand.
mean(my_matrix == 0)
## [1] 0.91094

Some functions that can be used on tdm.

# A term-document matrix carries two classes; the methods registered for
# each of them are listed below.
class(tdm)
## [1] "TermDocumentMatrix"    "simple_triplet_matrix"
methods(class = "TermDocumentMatrix")
##  [1] as.DocumentTermMatrix as.TermDocumentMatrix c                    
##  [4] dimnames<-            Docs                  findAssocs           
##  [7] inspect               nDocs                 nTerms               
## [10] plot                  print                 [                    
## [13] Terms                 tm_term_score         t                    
## see '?methods' for accessing help and source code
methods(class = "simple_triplet_matrix")
##  [1] aperm      as.array   as.matrix  as.vector  cbind      c         
##  [7] dimnames<- dimnames   dim<-      dim        duplicated is.numeric
## [13] Math       mean       Ops        print      rbind      [<-       
## [19] [          split      Summary    t          unique    
## see '?methods' for accessing help and source code
# Terms that occur at least 10 times in the corpus.
# Tokens such as "said." and "u.s." appear in the result because tdm was
# built without any punctuation removal.
findFreqTerms(x = tdm, lowfreq = 10)
##  [1] "about"      "and"        "are"        "bpd"        "but"       
##  [6] "crude"      "dlrs"       "for"        "from"       "government"
## [11] "has"        "its"        "kuwait"     "last"       "market"    
## [16] "mln"        "new"        "not"        "official"   "oil"       
## [21] "one"        "opec"       "pct"        "price"      "prices"    
## [26] "reuter"     "said"       "said."      "saudi"      "sheikh"    
## [31] "that"       "the"        "they"       "u.s."       "was"       
## [36] "were"       "will"       "with"       "would"
# Limit the matrix to specific words by passing a dictionary in the control
# list. This is a document-term matrix, so documents are rows and terms are
# columns (the transpose of tdm's layout).
inspect(DocumentTermMatrix(
  crude,
  control = list(dictionary = c("government", "market", "official"))
))
## <<DocumentTermMatrix (documents: 20, terms: 3)>>
## Non-/sparse entries: 15/45
## Sparsity           : 75%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## 
##      Terms
## Docs  government market official
##   127          0      0        0
##   144          0      3        0
##   191          0      0        0
##   194          0      0        0
##   211          0      0        0
##   236          0      0        5
##   237          5      0        0
##   242          0      1        1
##   246          6      0        0
##   248          0      4        1
##   273          0      1        4
##   349          0      1        2
##   352          0      1        1
##   353          0      0        0
##   368          0      0        0
##   489          0      0        0
##   502          0      0        0
##   543          0      0        0
##   704          0      1        0
##   708          0      0        0
# Terms whose correlation with "government" across documents is at least 0.8.
findAssocs(x = tdm, terms = "government", corlimit = 0.8)
## $government
##    early     pct. positive     been      say    since 
##     1.00     1.00     1.00     0.94     0.91     0.82

Some simple analyses of the dense matrix: term totals, correlation between documents, and k-means clustering of the documents.

# Total frequency of each term over all documents, most frequent first.
head(sort(rowSums(my_matrix), decreasing = TRUE))
##  the  oil  and said  for  its 
##  229   80   77   52   50   40
# Correlation between the term-frequency profiles of the first two documents
# (columns 1 and 2, i.e. documents 127 and 144).
cor(my_matrix[, 1], my_matrix[, 2])
## [1] 0.4910128
# kmeans() picks its starting centres at random, so the seed is fixed
# immediately beforehand to make the cluster assignment reproducible.
set.seed(31)
# Transpose so that documents are the rows (observations) being clustered
# and terms are the features.
my_cluster <- kmeans(x = t(my_matrix), centers = 3)
my_cluster$cluster
## 127 144 191 194 211 236 237 242 246 248 273 349 352 353 368 489 502 543 
##   3   1   3   3   3   1   2   3   2   1   1   3   3   3   3   3   3   3 
## 704 708 
##   2   3
# Headings of the documents assigned to cluster 1.
meta(crude, "heading")[my_cluster$cluster == 1]
## $`144`
## [1] "OPEC MAY HAVE TO MEET TO FIRM PRICES - ANALYSTS"
## 
## $`236`
## [1] "KUWAIT SAYS NO PLANS FOR EMERGENCY OPEC TALKS"
## 
## $`248`
## [1] "SAUDI ARABIA REITERATES COMMITMENT TO OPEC PACT"
## 
## $`273`
## [1] "SAUDI FEBRUARY CRUDE OUTPUT PUT AT 3.5 MLN BPD"
# Headings of the documents assigned to cluster 2
# (cluster 3, which holds the remaining 13 documents, is not listed).
meta(crude, "heading")[my_cluster$cluster == 2]
## $`237`
## [1] "INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE"
## 
## $`246`
## [1] "QATAR UNVEILS BUDGET FOR FISCAL 1987/88"
## 
## $`704`
## [1] "NYMEX WILL EXPAND OFF-HOUR TRADING APRIL ONE"

Published: February 18 2017

blog comments powered by Disqus