====== sna (social network analysis) and clustering ====== in R # library(igraph) data <- read.csv("http://commres.net/wiki/_media/r/socialnetworkdata.csv", header=T) head(data) str(data) y <- data.frame(data$first, data$second) head(y) # net <- graph.data.frame(y, directed=T) net <- graph_from_data_frame(y, directed=T) head(net) net V(net) # vertex in net data who.net <- V(net) # 52/52 vertices data.frame(who.net) E(net) # edge info in net data rel.net <- E(net) rel.net # output 290/290 edges # 52 by 290 data set V(net)$degree <- degree(net) degree.net <- data.frame(degree(net)) V(net)$degree str(who.net) who.net$name hist(V(net)$degree) set.seed(222) plot(net, vertex.color = 'lightblue', vertext.size = 2, edge.arrow.size = 0.1, vertex.label.cex = 0.8) plot(net, vertex.color = rainbow(52), vertex.size = V(net)$degree*0.3, edge.arrow.size = 0.1, layout=layout.fruchterman.reingold) # layout layout.fruchterman.reingold # layout.graphopt plot(net, vertex.color = rainbow(52), vertex.size = V(net)$degree*0.8, edge.arrow.size = 0.1, layout=layout.graphopt) plot(net, vertex.color = rainbow(52), vertex.size = V(net)$degree*0.4, edge.arrow.size = 0.1, layout=layout.kamada.kawai) betweenness(net) degree(net) closeness(net) hits_scores(net)$hub # outlinks hits_scores(net)$authority # inlinks hs <- hits_scores(net)$hub # outlinks as <- hits_scores(net)$authority # inlinks set.seed(123) plot(net, vertex.size=hs*30, main = 'Hubs', vertex.color = rainbow(52), edge.arrow.size=0.1, layout = layout.kamada.kawai) set.seed(123) plot(net, vertex.size=as*30, main = 'Authorities', vertex.color = rainbow(52), edge.arrow.size=0.1, layout = layout.kamada.kawai) net <- graph_from_data_frame(y, directed = F) cnet <- cluster_edge_betweenness(net) # cluster_fast_greedy(net) # cluster_fluid_communities(net, 2) # cluster_infomap(net) # cluster_label_prop() # cluster_spinglass() # cluster.distribution() # components(net, "strong") infomap <- cluster_infomap(net) # c.infomap <- cluster_infomap(net) plot(cnet, net, vertex.size = 10, vertex.label.cex = 0.8) plot(infomap, net, vertex.size = 10, vertex.label.cex = 0.8) ############################################################ # clustering in R # clusterdata.csv orgs <- read.csv("http://commres.net/wiki/_media/r/clusterdata.csv", header=T) str(orgs) head(orgs) pairs(orgs[2:9]) plot(Fuel_Cost ~ Sales, data = orgs) with(orgs, text(Fuel_Cost ~ Sales, labels=orgs$Company, pos=4)) # normalization (standardization) z <- orgs[,-c(1,1)] # remove the first column means <- apply(z, 2, mean) sds <- apply(z, 2, sd) means sds nor <- scale(z, center=means, scale=sds) nor distance <- dist(nor) distance orgs.hclust = hclust(distance) plot(orgs.hclust) plot(orgs.hclust, labels = orgs$Company, main='Default from hclust') plot(orgs.hclust, hang=-1, labels=orgs$Company, main='Default from hclust') orgs.hclust.average<-hclust(distance, method="average") plot(orgs.hclust.average, hang=-1, labels=orgs$Company, main='hclust average') plot(orgs.hclust.average, hang=-1, labels=orgs$Company, main='hclust average') rect.hclust(orgs.hclust.average, k=4, border='red') member.by.3 <- cutree(orgs.hclust, 3) table(member.by.3) member.by.3 member.by.5 <- cutree(orgs.hclust, 5) table(member.by.5) member.by.5 aggregate(nor,list(member.by.3),mean) aggregate(nor,list(member.by.3),sd) wss <- (nrow(nor)-1)*sum(apply(nor,2,var)) for (i in 2:20) wss[i] <- sum(kmeans(nor, centers=i)$withinss) plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares") # kmeans clustering set.seed(123) kc<-kmeans(nor, 3) kc # install.packages("cluster") library(cluster) ot<-nor datadistshortset <- dist(ot, method = "euclidean") hc1 <- hclust(datadistshortset, method = "complete" ) pamvshortset <- pam(datadistshortset, 4, diss = FALSE) clusplot(pamvshortset, shade = FALSE, labels=2, col.clus="blue", col.p="red", span=FALSE, main="Cluster Mapping", cex=1.2) # kmeans # install.packages("factoextra") library(factoextra) k2 <- kmeans(nor, centers = 3, nstart = 25) str(k2) fviz_cluster(k2, data = nor) fviz_nbclust(nor, kmeans, method = "wss") fviz_nbclust(nor, kmeans, method = "silhouette") gap_stat <- clusGap(nor, FUN = kmeans, nstart = 25, K.max = 10, B = 50) fviz_gap_stat(gap_stat) ====== output ====== > > > # > library(igraph) > data <- read.csv("http://commres.net/wiki/_media/r/socialnetworkdata.csv", header=T) > head(data) first second grade spec 1 AA DD 6 Y 2 AB DD 6 R 3 AF BA 6 Q 4 DD DA 6 Q 5 CD EC 6 X 6 DD CE 6 Y > str(data) 'data.frame': 290 obs. of 4 variables: $ first : chr "AA" "AB" "AF" "DD" ... $ second: chr "DD" "DD" "BA" "DA" ... $ grade : int 6 6 6 6 6 6 6 6 6 6 ... $ spec : chr "Y" "R" "Q" "Q" ... > > y <- data.frame(data$first, data$second) > head(y) data.first data.second 1 AA DD 2 AB DD 3 AF BA 4 DD DA 5 CD EC 6 DD CE > > # net <- graph.data.frame(y, directed=T) > net <- graph_from_data_frame(y, directed=T) > head(net) 6 x 52 sparse Matrix of class "dgCMatrix" [[ suppressing 52 column names ‘AA’, ‘AB’, ‘AF’ ... ]] AA . . . 10 . . . . . . . . . . . . . . . . . . . . . . . . . . . AB . . . 1 . 4 . . . . . . . . . . . . . . . . . . . . . . . . . AF . . . . . 3 . . 2 . . . . 1 . . . . . . . . . . . . . . . . . DD . . . . 2 . . . . . . . . . . . . . . . . . . 2 . . . . . . . CD . . . 10 . . . 4 . 2 . 2 . . 1 . . . . . . . . . . . . . . . . BA . 2 5 . . . . . 1 1 . . . . . 1 . . . . . . . . 1 . . . . . . AA . . . . . . . . . 1 1 . . . . . . . . . . AB . . 1 . . . . . . . . . . . . . . . . . . AF . . . . . . . . . . . . . . . . . . . 1 . DD . . . . 4 . . . . . . . . . . . . . . . . CD . . . . . 3 1 . . . . . . . . 1 . 1 . . . BA . . . . . . . . . . . . . . . . . . 1 . 1 > net IGRAPH 612e768 DN-- 52 290 -- + attr: name (v/c) + edges from 612e768 (vertex names): [1] AA->DD AB->DD AF->BA DD->DA CD->EC DD->CE CD->FA CD->CC BA->AF [10] CB->CA CC->CA CD->CA BC->CA DD->DA ED->AD AE->AC AB->BA CD->EC [19] CA->CC EB->CC BF->CE BB->CD AC->AE CC->FB DC->BB BD->CF DB->DA [28] DD->DA DB->DD BC->AF CF->DE DF->BF CB->CA BE->CA EA->CA CB->CA [37] CB->CA CC->CA CD->CA BC->CA BF->CA CE->CA AC->AD BD->BE AE->DF [46] CB->DF AC->DF AA->DD AA->DD AA->DD CD->DD AA->DD EE->DD CD->DD [55] DB->AA AA->FC BE->CC EF->FD CF->FE BB->DD CD->DD BA->AB CD->EC [64] BE->EE CE->CC CD->CC ED->CC BB->CC BE->CE DD->CE AC->CD ED->CD + ... omitted several edges > > V(net) # vertex in net data + 52/52 vertices, named, from 612e768: [1] AA AB AF DD CD BA CB CC BC ED AE CA EB BF BB AC DC BD DB CF DF [22] BE EA CE EE EF FF FD GB GC GD AD KA KF LC DA EC FA FB DE FC FE [43] GA GE KB KC KD KE LB LA LD LE > who.net <- V(net) # 52/52 vertices > data.frame(who.net) who.net AA 1 AB 2 AF 3 DD 4 CD 5 BA 6 CB 7 CC 8 BC 9 ED 10 AE 11 CA 12 EB 13 BF 14 BB 15 AC 16 DC 17 BD 18 DB 19 CF 20 DF 21 BE 22 EA 23 CE 24 EE 25 EF 26 FF 27 FD 28 GB 29 GC 30 GD 31 AD 32 KA 33 KF 34 LC 35 DA 36 EC 37 FA 38 FB 39 DE 40 FC 41 FE 42 GA 43 GE 44 KB 45 KC 46 KD 47 KE 48 LB 49 LA 50 LD 51 LE 52 > > E(net) # edge info in net data + 290/290 edges from 612e768 (vertex names): [1] AA->DD AB->DD AF->BA DD->DA CD->EC DD->CE CD->FA CD->CC BA->AF [10] CB->CA CC->CA CD->CA BC->CA DD->DA ED->AD AE->AC AB->BA CD->EC [19] CA->CC EB->CC BF->CE BB->CD AC->AE CC->FB DC->BB BD->CF DB->DA [28] DD->DA DB->DD BC->AF CF->DE DF->BF CB->CA BE->CA EA->CA CB->CA [37] CB->CA CC->CA CD->CA BC->CA BF->CA CE->CA AC->AD BD->BE AE->DF [46] CB->DF AC->DF AA->DD AA->DD AA->DD CD->DD AA->DD EE->DD CD->DD [55] DB->AA AA->FC BE->CC EF->FD CF->FE BB->DD CD->DD BA->AB CD->EC [64] BE->EE CE->CC CD->CC ED->CC BB->CC BE->CE DD->CE AC->CD ED->CD [73] FF->CD AC->CD DD->CD DD->CD AE->GA AE->GA AE->GA AE->GA BA->ED [82] BE->ED EB->ED CD->ED FD->EF FD->EF CD->BB BF->BB BC->BB BB->CF + ... omitted several edges > rel.net <- E(net) > rel.net # output 290/290 edges + 290/290 edges from 612e768 (vertex names): [1] AA->DD AB->DD AF->BA DD->DA CD->EC DD->CE CD->FA CD->CC BA->AF [10] CB->CA CC->CA CD->CA BC->CA DD->DA ED->AD AE->AC AB->BA CD->EC [19] CA->CC EB->CC BF->CE BB->CD AC->AE CC->FB DC->BB BD->CF DB->DA [28] DD->DA DB->DD BC->AF CF->DE DF->BF CB->CA BE->CA EA->CA CB->CA [37] CB->CA CC->CA CD->CA BC->CA BF->CA CE->CA AC->AD BD->BE AE->DF [46] CB->DF AC->DF AA->DD AA->DD AA->DD CD->DD AA->DD EE->DD CD->DD [55] DB->AA AA->FC BE->CC EF->FD CF->FE BB->DD CD->DD BA->AB CD->EC [64] BE->EE CE->CC CD->CC ED->CC BB->CC BE->CE DD->CE AC->CD ED->CD [73] FF->CD AC->CD DD->CD DD->CD AE->GA AE->GA AE->GA AE->GA BA->ED [82] BE->ED EB->ED CD->ED FD->EF FD->EF CD->BB BF->BB BC->BB BB->CF + ... omitted several edges > > # 52 by 290 data set > > V(net)$degree <- degree(net) > degree.net <- data.frame(degree(net)) > V(net)$degree [1] 18 9 23 36 40 26 24 50 21 27 15 62 7 12 23 27 2 4 8 12 23 [22] 20 8 10 6 8 1 8 1 1 1 9 3 3 1 7 3 1 1 2 1 2 [43] 5 1 1 1 1 1 1 1 1 1 > > > str(who.net) 'igraph.vs' Named int [1:52] 1 2 3 4 5 6 7 8 9 10 ... - attr(*, "names")= chr [1:52] "AA" "AB" "AF" "DD" ... - attr(*, "is_all")= logi TRUE - attr(*, "env")= - attr(*, "graph")= chr "612e7682-aabd-11ef-9ea2-7d7f9645968c" > who.net$name [1] "AA" "AB" "AF" "DD" "CD" "BA" "CB" "CC" "BC" "ED" "AE" "CA" [13] "EB" "BF" "BB" "AC" "DC" "BD" "DB" "CF" "DF" "BE" "EA" "CE" [25] "EE" "EF" "FF" "FD" "GB" "GC" "GD" "AD" "KA" "KF" "LC" "DA" [37] "EC" "FA" "FB" "DE" "FC" "FE" "GA" "GE" "KB" "KC" "KD" "KE" [49] "LB" "LA" "LD" "LE" > > hist(V(net)$degree) > > set.seed(222) > plot(net, + vertex.color = 'lightblue', + vertext.size = 2, + edge.arrow.size = 0.1, + vertex.label.cex = 0.8) > > plot(net, + vertex.color = rainbow(52), + vertex.size = V(net)$degree*0.3, + edge.arrow.size = 0.1, + layout=layout.fruchterman.reingold) > # layout layout.fruchterman.reingold > # layout.graphopt > plot(net, + vertex.color = rainbow(52), + vertex.size = V(net)$degree*0.8, + edge.arrow.size = 0.1, + layout=layout.graphopt) > > plot(net, + vertex.color = rainbow(52), + vertex.size = V(net)$degree*0.4, + edge.arrow.size = 0.1, + layout=layout.kamada.kawai) > > betweenness(net) AA AB AF DD CD BA 89.196239 4.564734 109.762225 162.990640 375.992047 184.496219 CB CC BC ED AE CA 36.237779 225.269107 94.821412 343.016093 105.368424 292.157605 EB BF BB AC DC BD 0.000000 28.446981 244.963181 172.427422 0.000000 5.966115 DB CF DF BE EA CE 0.000000 181.814414 42.768995 55.964872 183.604022 11.943347 EE EF FF FD GB GC 4.002137 0.000000 0.000000 33.000000 0.000000 0.000000 GD AD KA KF LC DA 0.000000 48.996857 0.000000 17.229134 0.000000 0.000000 EC FA FB DE FC FE 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 GA GE KB KC KD KE 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 LB LA LD LE 0.000000 0.000000 0.000000 0.000000 > degree(net) AA AB AF DD CD BA CB CC BC ED AE CA EB BF BB AC DC BD DB CF DF BE 18 9 23 36 40 26 24 50 21 27 15 62 7 12 23 27 2 4 8 12 23 20 EA CE EE EF FF FD GB GC GD AD KA KF LC DA EC FA FB DE FC FE GA GE 8 10 6 8 1 8 1 1 1 9 3 3 1 7 3 1 1 2 1 2 5 1 KB KC KD KE LB LA LD LE 1 1 1 1 1 1 1 1 > closeness(net) AA AB AF DD CD 0.005988024 0.007874016 0.008196721 0.007352941 0.010101010 BA CB CC BC ED 0.009900990 0.010101010 0.008771930 0.008695652 0.012048193 AE CA EB BF BB 0.010000000 0.008264463 0.009009009 0.009090909 0.010869565 AC DC BD DB CF 0.011111111 0.008547009 0.008771930 0.005917160 0.009433962 DF BE EA CE EE 0.010869565 0.009433962 0.006711409 0.006849315 0.007751938 EF FF FD GB GC 0.005319149 0.007042254 0.005319149 0.006329114 0.004761905 GD AD KA KF LC 0.006369427 0.009174312 0.008620690 0.005347594 0.006097561 DA EC FA FB DE NaN NaN NaN NaN NaN FC FE GA GE KB NaN NaN NaN NaN NaN KC KD KE LB LA NaN NaN NaN NaN NaN LD LE NaN NaN > hits_scores(net)$hub # outlinks AA AB AF DD CD 8.042807e-02 1.300950e-02 4.731440e-03 1.255496e-02 2.350165e-01 BA CB CC BC ED 3.239013e-02 7.145888e-01 1.000000e+00 2.321828e-01 4.858357e-02 AE CA EB BF BB 7.949708e-02 4.494956e-02 1.870058e-01 1.272314e-01 1.150808e-01 AC DC BD DB CF 2.093518e-01 7.455515e-03 1.305960e-02 1.738176e-02 7.102816e-03 DF BE EA CE EE 2.505787e-02 2.195271e-01 5.693409e-02 6.319085e-02 1.273824e-02 EF FF FD GB GC 1.119098e-03 4.485458e-03 4.563724e-04 3.594258e-04 2.507046e-04 GD AD KA KF LC 6.374463e-03 9.643170e-03 6.276463e-02 4.418478e-04 5.681639e-02 DA EC FA FB DE 1.458925e-19 1.458925e-19 1.458925e-19 1.458925e-19 1.458925e-19 FC FE GA GE KB 1.458925e-19 1.458925e-19 1.458925e-19 1.458925e-19 1.458925e-19 KC KD KE LB LA 1.458925e-19 1.458925e-19 1.458925e-19 1.458925e-19 1.458925e-19 LD LE 1.458925e-19 1.458925e-19 > hits_scores(net)$authority # inlinks AA AB AF DD CD 4.412541e-03 3.968841e-03 8.375004e-02 1.409698e-01 7.894656e-02 BA CB CC BC ED 2.146204e-02 3.660444e-03 1.121941e-01 6.326094e-03 4.799095e-02 AE CA EB BF BB 1.524499e-02 1.000000e+00 0.000000e+00 6.072030e-03 8.323026e-02 AC DC BD DB CF 6.262068e-02 0.000000e+00 2.486679e-04 0.000000e+00 8.146610e-03 DF BE EA CE EE 1.925650e-01 1.095155e-01 7.776767e-03 2.437300e-02 2.419075e-02 EF FF FD GB GC 6.390996e-05 0.000000e+00 2.071608e-03 0.000000e+00 0.000000e+00 GD AD KA KF LC 0.000000e+00 1.806051e-02 0.000000e+00 2.156359e-03 0.000000e+00 DA EC FA FB DE 3.583779e-03 2.468362e-02 8.227873e-03 3.500976e-02 4.973358e-04 FC FE GA GE KB 2.815768e-03 3.064436e-03 1.391587e-02 3.500976e-02 3.500976e-02 KC KD KE LB LA 2.486679e-04 8.227873e-03 2.501759e-02 8.227873e-03 1.133971e-03 LD LE 1.656466e-04 1.133971e-03 > > hs <- hits_scores(net)$hub # outlinks > as <- hits_scores(net)$authority # inlinks > > > > set.seed(123) > plot(net, + vertex.size=hs*30, + main = 'Hubs', + vertex.color = rainbow(52), + edge.arrow.size=0.1, + layout = layout.kamada.kawai) > > set.seed(123) > plot(net, + vertex.size=as*30, + main = 'Authorities', + vertex.color = rainbow(52), + edge.arrow.size=0.1, + layout = layout.kamada.kawai) > > > net <- graph_from_data_frame(y, directed = F) > cnet <- cluster_edge_betweenness(net) > > # cluster_fast_greedy(net) > # cluster_fluid_communities(net, 2) > # cluster_infomap(net) > # cluster_label_prop() > # cluster_spinglass() > # cluster.distribution() > # components(net, "strong") > > infomap <- cluster_infomap(net) > # c.infomap <- cluster_infomap(net) > > plot(cnet, + net, + vertex.size = 10, + vertex.label.cex = 0.8) > > plot(infomap, + net, + vertex.size = 10, + vertex.label.cex = 0.8) > > ############################################################ > # clustering in R > # clusterdata.csv > orgs <- read.csv("http://commres.net/wiki/_media/r/clusterdata.csv", header=T) > str(orgs) 'data.frame': 22 obs. of 9 variables: $ Company : chr "Arizona " "Boston " "Central " "Commonwealth" ... $ Fixed_charge: num 1.06 0.89 1.43 1.02 1.49 1.32 1.22 1.1 1.34 1.12 ... $ RoR : num 9.2 10.3 15.4 11.2 8.8 13.5 12.2 9.2 13 12.4 ... $ Cost : int 151 202 113 168 192 111 175 245 168 197 ... $ Load : num 54.4 57.9 53 56 51.2 60 67.6 57 60.4 53 ... $ D.Demand : num 1.6 2.2 3.4 0.3 1 -2.2 2.2 3.3 7.2 2.7 ... $ Sales : int 9077 5088 9212 6423 3300 11127 7642 13082 8406 6455 ... $ Nuclear : num 0 25.3 0 34.3 15.6 22.5 0 0 0 39.2 ... $ Fuel_Cost : num 0.628 1.555 1.058 0.7 2.044 ... > head(orgs) Company Fixed_charge RoR Cost Load D.Demand Sales Nuclear 1 Arizona 1.06 9.2 151 54.4 1.6 9077 0.0 2 Boston 0.89 10.3 202 57.9 2.2 5088 25.3 3 Central 1.43 15.4 113 53.0 3.4 9212 0.0 4 Commonwealth 1.02 11.2 168 56.0 0.3 6423 34.3 5 Con Ed NY 1.49 8.8 192 51.2 1.0 3300 15.6 6 Florida 1.32 13.5 111 60.0 -2.2 11127 22.5 Fuel_Cost 1 0.628 2 1.555 3 1.058 4 0.700 5 2.044 6 1.241 > > pairs(orgs[2:9]) > > plot(Fuel_Cost ~ Sales, data = orgs) > with(orgs, text(Fuel_Cost ~ Sales, + labels=orgs$Company, + pos=4)) > > # normalization (standardization) > z <- orgs[,-c(1,1)] # remove the first column > means <- apply(z, 2, mean) > sds <- apply(z, 2, sd) > means Fixed_charge RoR Cost Load D.Demand 1.114091 10.736364 168.181818 56.977273 3.240909 Sales Nuclear Fuel_Cost 8914.045455 12.000000 1.102727 > sds Fixed_charge RoR Cost Load D.Demand 0.1845112 2.2440494 41.1913495 4.4611478 3.1182503 Sales Nuclear Fuel_Cost 3549.9840305 16.7919198 0.5560981 > > nor <- scale(z, center=means, scale=sds) > nor Fixed_charge RoR Cost Load [1,] -0.29315791 -0.68463896 -0.417122002 -0.57771516 [2,] -1.21451134 -0.19445367 0.821002037 0.20683629 [3,] 1.71214073 2.07822360 -1.339645796 -0.89153574 [4,] -0.50994695 0.20660702 -0.004413989 -0.21906307 [5,] 2.03732429 -0.86288816 0.578232617 -1.29501935 [6,] 1.11597086 1.23153991 -1.388199680 0.67756716 [7,] 0.57399826 0.65223002 0.165524604 2.38116460 [8,] -0.07636887 -0.68463896 1.864910540 0.00509449 [9,] 1.22436538 1.00872841 -0.004413989 0.76723019 [10,] 0.03202565 0.74135462 0.699617327 -0.89153574 [11,] -1.97327298 -1.44219805 0.116970720 -1.22777208 [12,] 0.08622291 0.07292013 0.238355430 1.12588228 [13,] 0.19461744 0.87504152 0.748171211 -0.73462545 [14,] -0.13056613 0.56310542 -1.752353809 -1.60883993 [15,] -0.83513051 -1.39763576 -0.101521757 1.17071379 [16,] 0.24881470 -0.37270287 2.034849134 -0.21906307 [17,] -1.91907572 -1.93238335 -0.781276132 1.10346652 [18,] -0.34735517 0.83047922 -0.441398944 -0.06215278 [19,] 0.24881470 0.42941852 -1.558138274 -0.66737818 [20,] 0.46560374 0.47398082 -0.489952828 0.65515141 [21,] -0.40155243 -0.95201276 0.869555920 0.90172472 [22,] -0.23896065 -0.64007666 0.141247662 -0.60013092 D.Demand Sales Nuclear Fuel_Cost [1,] -0.52622751 0.04590290 -0.7146294 -0.85367545 [2,] -0.33381191 -1.07776413 0.7920476 0.81329670 [3,] 0.05101929 0.08393124 -0.7146294 -0.08043055 [4,] -0.94312798 -0.70170610 1.3280197 -0.72420189 [5,] -0.71864311 -1.58142837 0.2143888 1.69263800 [6,] -1.74485965 0.62337028 0.6253007 0.24864810 [7,] -0.33381191 -0.35832428 -0.7146294 0.98772637 [8,] 0.01895002 1.17407698 -0.7146294 -1.42731528 [9,] 1.26965142 -0.14311204 -0.7146294 -0.43288637 [10,] -0.17346558 -0.69269198 1.6198267 -0.86266667 [11,] 1.04516655 2.40196983 -0.7146294 -0.60192130 [12,] 0.14722709 -0.77748109 -0.7146294 1.42829614 [13,] 1.01309729 -0.48874740 2.2749037 -1.03529809 [14,] -0.59036605 0.21379097 -0.7146294 -0.92560521 [15,] -1.07140505 -0.68902999 -0.6610322 0.53456889 [16,] 1.91103676 1.99351729 -0.7146294 -0.86806140 [17,] 1.84689822 -0.90142531 -0.2203441 1.46965575 [18,] -0.17346558 0.34534086 -0.7146294 0.00948165 [19,] -1.71279038 1.29379583 -0.7146294 -0.83928950 [20,] 0.08308855 -0.45832473 1.7329764 -0.72060540 [21,] 0.08308855 -0.63776215 -0.7146294 1.82211157 [22,] 0.85275095 0.33210137 0.8694658 0.36553395 attr(,"scaled:center") Fixed_charge RoR Cost Load D.Demand 1.114091 10.736364 168.181818 56.977273 3.240909 Sales Nuclear Fuel_Cost 8914.045455 12.000000 1.102727 attr(,"scaled:scale") Fixed_charge RoR Cost Load D.Demand 0.1845112 2.2440494 41.1913495 4.4611478 3.1182503 Sales Nuclear Fuel_Cost 3549.9840305 16.7919198 0.5560981 > > distance <- dist(nor) > distance 1 2 3 4 5 6 7 2 3.096154 3 3.679230 4.916465 4 2.462149 2.164213 4.107079 5 4.123129 3.852850 4.468735 4.127368 6 3.606269 4.218804 2.992760 3.201836 4.600183 7 3.901898 3.448346 4.217769 3.969367 4.596261 3.352919 8 2.737407 3.892524 4.990876 3.692949 5.155516 4.913953 4.364509 9 3.253851 3.957125 2.752623 3.753627 4.489900 3.730814 2.796298 10 3.099116 2.705330 3.934935 1.491427 4.045276 3.829058 4.506512 11 3.491163 4.792640 5.902882 4.864730 6.460986 6.004557 5.995814 12 3.223138 2.432568 4.031434 3.498769 3.603863 3.738824 1.660047 13 3.959637 3.434878 4.385973 2.577003 4.758059 4.554909 5.010221 14 2.113490 4.323825 2.742000 3.230069 4.818803 3.469268 4.914949 15 2.593481 2.501195 5.156977 3.190250 4.255251 4.065764 2.930142 16 4.033051 4.837051 5.264442 4.967244 5.816715 5.842268 5.042444 17 4.396680 3.623588 6.356548 4.893679 5.628591 6.099456 4.577294 18 1.877248 2.904409 2.723954 2.651532 4.338150 2.853942 2.949006 19 2.410434 4.634878 3.179392 3.464171 5.133791 2.581208 4.515428 20 3.174488 2.997481 3.733274 1.816465 4.385852 2.912401 3.541931 21 3.453407 2.318451 5.088018 3.884260 3.644137 4.628341 2.675404 22 2.509287 2.421916 4.109321 2.578463 3.771757 4.026935 4.000096 8 9 10 11 12 13 14 2 3 4 5 6 7 8 9 3.594824 10 3.673884 3.572023 11 3.462587 5.175240 5.081469 12 4.059770 2.735861 3.942171 5.208504 13 4.140607 3.658647 1.407032 5.309741 4.496249 14 4.335241 3.816443 3.610272 4.315584 4.335484 4.385649 15 3.849872 4.113606 4.264133 4.735659 2.328833 5.103646 4.239522 16 2.201457 3.627307 4.531420 3.429962 4.617791 4.406173 5.169314 17 5.426511 4.901037 5.484537 4.751387 3.497555 5.606577 5.558002 18 3.237409 2.428533 3.070750 3.945595 2.451935 3.780942 2.301050 19 4.107966 4.109049 4.130120 4.522319 4.414578 5.010864 1.876051 20 4.094283 2.948021 2.054393 5.352136 3.430937 2.226493 3.744430 21 3.977130 3.742680 4.361961 4.883977 1.384124 4.937119 4.926966 22 3.239374 3.208932 2.559945 3.436927 2.995066 2.739910 3.512207 15 16 17 18 19 20 21 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 5.175157 17 3.399659 5.559320 18 2.998784 3.973815 4.426129 19 4.030721 5.232256 6.089597 2.473696 20 3.782111 4.823711 4.866540 2.922392 3.903723 21 2.097150 4.568885 3.095002 3.185250 4.972551 4.145222 22 3.352644 3.457129 3.628061 2.548060 3.967618 2.618050 3.012264 > > orgs.hclust = hclust(distance) > > plot(orgs.hclust) > plot(orgs.hclust, + labels = orgs$Company, + main='Default from hclust') > plot(orgs.hclust, + hang=-1, + labels=orgs$Company, + main='Default from hclust') > > > orgs.hclust.average<-hclust(distance, method="average") > plot(orgs.hclust.average, hang=-1, + labels=orgs$Company, + main='hclust average') > > plot(orgs.hclust.average, hang=-1, + labels=orgs$Company, + main='hclust average') > rect.hclust(orgs.hclust.average, k=4, border='red') > > member.by.3 <- cutree(orgs.hclust, 3) > table(member.by.3) member.by.3 1 2 3 14 5 3 > member.by.3 [1] 1 1 1 1 1 1 2 3 1 1 3 2 1 1 2 3 2 1 1 1 2 1 > > member.by.5 <- cutree(orgs.hclust, 5) > table(member.by.5) member.by.5 1 2 3 4 5 7 6 1 5 3 > member.by.5 [1] 1 2 1 2 3 1 4 5 1 2 5 4 2 1 4 5 4 1 1 2 4 2 > > > aggregate(nor,list(member.by.3),mean) Group.1 Fixed_charge RoR Cost Load 1 1 0.3068832 0.4326015 -0.31481203 -0.3743722 2 2 -0.4991075 -0.7113763 0.07812761 1.3365904 3 3 -0.6002757 -0.8331800 1.33891013 -0.4805802 D.Demand Sales Nuclear Fuel_Cost 1 -0.2605107 -0.1575387 0.3692252 -0.2389329 2 0.1343994 -0.6728046 -0.6050529 1.2484717 3 0.9917178 1.8565214 -0.7146294 -0.9657660 > aggregate(nor,list(member.by.3),sd) Group.1 Fixed_charge RoR Cost Load D.Demand 1 1 0.9132159 0.8165532 0.8995420 0.7412840 0.9032477 2 2 0.9530092 1.0599834 0.5980215 0.5929804 1.0733867 3 3 1.2001155 0.5500030 1.0616363 0.6567217 0.9471751 Sales Nuclear Fuel_Cost 1 0.7392081 1.0933974 0.7974146 2 0.2022460 0.2163074 0.4969827 3 0.6253048 0.0000000 0.4212819 > > wss <- (nrow(nor)-1)*sum(apply(nor,2,var)) > for (i in 2:20) + wss[i] <- sum(kmeans(nor, centers=i)$withinss) > plot(1:20, wss, + type="b", + xlab="Number of Clusters", + ylab="Within groups sum of squares") > > > # kmeans clustering > set.seed(123) > kc<-kmeans(nor, 3) > kc K-means clustering with 3 clusters of sizes 7, 5, 10 Cluster means: Fixed_charge RoR Cost Load D.Demand 1 -0.23896065 -0.65917479 0.2556961 0.7992527 -0.05435116 2 0.51980100 1.02655333 -1.2959473 -0.5104679 -0.83409247 3 -0.09262805 -0.05185431 0.4689864 -0.3042429 0.45509205 Sales Nuclear Fuel_Cost 1 -0.8604593 -0.2884040 1.2497562 2 0.5120458 -0.4466434 -0.3174391 3 0.3462986 0.4252045 -0.7161098 Clustering vector: [1] 3 1 2 3 1 2 1 3 3 3 3 1 3 2 1 3 1 2 2 3 1 3 Within cluster sum of squares by cluster: [1] 34.16481 15.15613 57.53424 (between_SS / total_SS = 36.4 %) Available components: [1] "cluster" "centers" "totss" "withinss" [5] "tot.withinss" "betweenss" "size" "iter" [9] "ifault" > > # install.packages("cluster") > library(cluster) > > ot<-nor > datadistshortset <- dist(ot, method = "euclidean") > hc1 <- hclust(datadistshortset, method = "complete" ) > pamvshortset <- pam(datadistshortset, 4, diss = FALSE) > clusplot(pamvshortset, shade = FALSE, + labels=2, + col.clus="blue", + col.p="red", + span=FALSE, + main="Cluster Mapping", + cex=1.2) > > # kmeans > # install.packages("factoextra") > library(factoextra) > k2 <- kmeans(nor, centers = 3, nstart = 25) > str(k2) List of 9 $ cluster : int [1:22] 3 2 3 3 2 3 2 1 3 3 ... $ centers : num [1:3, 1:8] -0.6 -0.239 0.289 -0.833 -0.659 ... ..- attr(*, "dimnames")=List of 2 .. ..$ : chr [1:3] "1" "2" "3" .. ..$ : chr [1:8] "Fixed_charge" "RoR" "Cost" "Load" ... $ totss : num 168 $ withinss : num [1:3] 9.53 34.16 58.01 $ tot.withinss: num 102 $ betweenss : num 66.3 $ size : int [1:3] 3 7 12 $ iter : int 2 $ ifault : int 0 - attr(*, "class")= chr "kmeans" > > fviz_cluster(k2, data = nor) > fviz_nbclust(nor, kmeans, method = "wss") > fviz_nbclust(nor, kmeans, method = "silhouette") > > gap_stat <- clusGap(nor, FUN = kmeans, + nstart = 25, K.max = 10, B = 50) Clustering k = 1,2,..., K.max (= 10): .. done Bootstrapping, b = 1,2,..., B (= 50) [one "." per sample]: .................................................. 50 > fviz_gap_stat(gap_stat) > {{:pasted:20241125-084208.png}} {{:pasted:20241125-084217.png}} {{:pasted:20241125-084238.png}} {{:pasted:20241125-084251.png}} {{:pasted:20241125-084259.png}} {{:pasted:20241125-084307.png}} {{:pasted:20241125-084325.png}} {{:pasted:20241125-084337.png}} {{:pasted:20241125-084346.png}} ---- {{:pasted:20241125-084358.png}} {{:pasted:20241125-084405.png}} {{:pasted:20241125-084413.png}} {{:pasted:20241125-084421.png}} {{:pasted:20241125-084429.png}} {{:pasted:20241125-084440.png}} {{:pasted:20241125-084450.png}} {{:pasted:20241125-084501.png}} {{:pasted:20241125-084511.png}} {{:pasted:20241125-084521.png}} {{:pasted:20241125-084533.png}} {{:pasted:20241125-084541.png}} {{:pasted:20241125-084552.png}}