User Tools

Site Tools


r:document_classification

This is an old revision of the document!


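Document Classification

The session below uses the crude corpus that ships with the tm package; run library(tm) and data("crude") first so that crude is available.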

> c <- crude
> c <- tm_map(c, content_transformer(tolower))
> c <- tm_map(c, content_transformer(removeNumbers))
> c <- tm_map(c, content_transformer(removeNumbers))
> x <- TermDocumentMatrix(c)
> inspect(x[1:10, 1:10])
<<TermDocumentMatrix (terms: 10, documents: 10)>>
Non-/sparse entries: 7/93
Sparsity           : 93%
Maximal term length: 10
Weighting          : term frequency (tf)

            Docs
Terms        127 144 191 194 211 236 237 242 246 248
  -/,          0   0   0   0   0   0   0   4   0   0
  -foot        0   0   0   0   0   0   0   0   0   0
  -hour        0   0   0   0   0   0   0   0   0   0
  -member      0   0   0   0   0   0   0   1   0   0
  -nation      0   0   0   0   0   1   0   0   0   0
  "(it)        0   0   0   0   0   0   1   0   0   0
  "demand      0   1   0   0   0   0   0   0   0   0
  "expansion   0   0   0   0   0   0   0   0   0   0
  "for         0   0   0   0   0   0   1   0   0   0
  "growth      0   0   0   0   0   0   1   0   0   0
> c <- tm_map(c, content_transformer(removePunctuation))
> x <- TermDocumentMatrix(c)
> inspect(x[1:10, 1:10])
<<TermDocumentMatrix (terms: 10, documents: 10)>>
Non-/sparse entries: 13/87
Sparsity           : 87%
Maximal term length: 9
Weighting          : term frequency (tf)

           Docs
Terms       127 144 191 194 211 236 237 242 246 248
  abdulaziz   0   0   0   0   0   0   0   0   5   0
  ability     0   2   0   0   0   3   0   0   0   0
  able        0   0   0   0   0   0   0   0   0   0
  about       0   1   0   0   1   0   1   0   2   2
  above       0   2   0   0   0   3   0   0   0   2
  abroad      0   0   0   0   0   1   0   0   0   0
  accept      0   0   0   0   0   0   0   0   0   0
  accord      0   0   0   0   0   0   0   0   0   5
  according   0   0   0   0   0   0   0   0   0   0
  across      0   0   0   0   0   0   0   0   0   0
> c <- crude 
> c <- tm_map(c, content_transformer(tolower))
> c <- tm_map(c, content_transformer(removePunctuation))
> c <- tm_map(c, content_transformer(removeNumbers))
> x <- TermDocumentMatrix(c)
> inspect(x[1:10, 1:10])
<<TermDocumentMatrix (terms: 10, documents: 10)>>
Non-/sparse entries: 13/87
Sparsity           : 87%
Maximal term length: 9
Weighting          : term frequency (tf)

           Docs
Terms       127 144 191 194 211 236 237 242 246 248
  abdulaziz   0   0   0   0   0   0   0   0   5   0
  ability     0   2   0   0   0   3   0   0   0   0
  able        0   0   0   0   0   0   0   0   0   0
  about       0   1   0   0   1   0   1   0   2   2
  above       0   2   0   0   0   3   0   0   0   2
  abroad      0   0   0   0   0   1   0   0   0   0
  accept      0   0   0   0   0   0   0   0   0   0
  accord      0   0   0   0   0   0   0   0   0   5
  according   0   0   0   0   0   0   0   0   0   0
  across      0   0   0   0   0   0   0   0   0   0
> 
> y <- TermDocumentMatrix(x, control=list(weighting=weightTfIdf))
Error in UseMethod("TermDocumentMatrix", x) : 
  no applicable method for 'TermDocumentMatrix' applied to an object of class "c('TermDocumentMatrix', 'simple_triplet_matrix')"
> y <- TermDocumentMatrix(c, control=list(weighting=weightTfIdf))
> inspect(y[1:10, 1:10])
<<TermDocumentMatrix (terms: 10, documents: 10)>>
Non-/sparse entries: 13/87
Sparsity           : 87%
Maximal term length: 9
Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)

           Docs
Terms       127         144 191 194       211        236
  abdulaziz   0 0.000000000   0   0 0.0000000 0.00000000
  ability     0 0.015079700   0   0 0.0000000 0.02268204
  able        0 0.000000000   0   0 0.0000000 0.00000000
  about       0 0.003641675   0   0 0.0181086 0.00000000
  above       0 0.012792992   0   0 0.0000000 0.01924250
  abroad      0 0.000000000   0   0 0.0000000 0.01193903
  accept      0 0.000000000   0   0 0.0000000 0.00000000
  accord      0 0.000000000   0   0 0.0000000 0.00000000
  according   0 0.000000000   0   0 0.0000000 0.00000000
  across      0 0.000000000   0   0 0.0000000 0.00000000
           Docs
Terms              237 242        246         248
  abdulaziz 0.00000000   0 0.08575254 0.000000000
  ability   0.00000000   0 0.00000000 0.000000000
  able      0.00000000   0 0.00000000 0.000000000
  about     0.00378776   0 0.01049149 0.009408741
  above     0.00000000   0 0.00000000 0.016526179
  abroad    0.00000000   0 0.00000000 0.000000000
  accept    0.00000000   0 0.00000000 0.000000000
  accord    0.00000000   0 0.00000000 0.048700455
  according 0.00000000   0 0.00000000 0.000000000
  across    0.00000000   0 0.00000000 0.000000000
> freqTerms <- findFreqTerms(x, lowfreq=10)
> freqTerms
 [1] "about"      "and"        "are"        "barrel"    
 [5] "barrels"    "bpd"        "but"        "crude"     
 [9] "dlrs"       "for"        "from"       "government"
[13] "has"        "industry"   "its"        "kuwait"    
[17] "last"       "market"     "meeting"    "minister"  
[21] "mln"        "new"        "not"        "official"  
[25] "oil"        "one"        "opec"       "pct"       
[29] "price"      "prices"     "production" "reuter"    
[33] "said"       "saudi"      "sheikh"     "that"      
[37] "the"        "they"       "this"       "was"       
[41] "were"       "will"       "with"       "world"     
[45] "would"     
> findAssocs(x, "oil", 0.7)
$oil
     opec     named   clearly      late    prices    trying 
     0.87      0.81      0.79      0.79      0.79      0.79 
      who    winter   markets      said  analysts agreement 
     0.79      0.79      0.78      0.78      0.77      0.76 
emergency      that     above      they    buyers     fixed 
     0.74      0.74      0.73      0.73      0.71      0.71 
  through 
     0.70 

> 
r/document_classification.1481678347.txt.gz · Last modified: 2016/12/14 09:49 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki