Visualizing clusters of SMEs

Here we look at understanding the differences between the clusters we identified in the previous analysis. We will attach data from the

require(data.table, quietly = TRUE)
setwd("~/Dubrovnik/")
load("APR-processed-v4.rda")  ## loads d
names(d)  ## print column names as a reference
##  [1] "year"             "EURRSD"           "deflator"        
##  [4] "id"               "capital"          "costs"           
##  [7] "empl"             "fixedAssets"      "foreignOwners"   
## [10] "interest"         "liabLT"           "liabST"          
## [13] "liabSTfin"        "loansLT"          "revenues"        
## [16] "totalAssets"      "wages"            "state.connection"
## [19] "state.owner.type" "state.share"      "state.legal.form"
## [22] "priv.method"      "reg.code"         "region"          
## [25] "district.code"    "district"         "town.code"       
## [28] "town"             "muni.code"        "muni"            
## [31] "estab.date"       "removal.date"     "sector.mod"      
## [34] "ind.code"         "trade.sector"     "trade.name"      
## [37] "legal.form"       "status"           "sitc"            
## [40] "compet.cms"       "market.growing"   "entrepreneur"    
## [43] "restructuring"    "export.rank"      "export"          
## [46] "is.sme"
setkey(d, id, year)

We have only run the clustering analysis on SMEs, so we exclude the rest of the companies.

sum(!d$is.sme)  ## how many non-SME observations are there?
## [1] 5640
d <- d[is.sme == TRUE]  ## only keep those with non-NA year

We also load cluster assignment data and plot the within-cluster sum of squares to choose the number of clusters.

load("~/Dubrovnik/Clustering/APR-km-clusters-v4.rda")  ## loads clusters
names(clusters)
##  [1] "id"   "year" "k.3"  "k.4"  "k.5"  "k.6"  "k.7"  "k.8"  "k.9"  "k.10"

require(ggplot2, quietly = TRUE)
ggplot(data.frame(k = 2 + 1:length(within.ss), within.ss), aes(x = k, y = within.ss/1e+06)) + 
    geom_line(aes(group = 1)) + xlab("Number of clusters") + ylab("Within-cluster sum of squares (M)") + 
    ggtitle("Within-cluster SS from K-means clustering\n          without location data") + 
    scale_x_continuous(breaks = 2 + 1:length(within.ss)) + theme_bw() + theme(panel.grid.minor.x = element_blank())

plot of chunk unnamed-chunk-3

We will be using 6, 7, 8, or 10 clustersand see how they are distributed to make a judgement on how many to use.

setkey(clusters, id, year)
d <- d[clusters[, list(k.6, k.7, k.8, k.10), by = "id,year"]]

How are the clusters distributed?

We first show a simple tabulation and histograms to understand the size of each cluster at 6, 7, and 8 clusters.

require(ggplot2, quietly = TRUE)
tt6 <- table(d$k.6)
round(100 * tt6/sum(tt6))
## 
##  1  2  3  4  5  6 
##  7  2 20 25 11 36

tt6 <- data.frame(tt6)
names(tt6) <- c("cluster", "count")
ggplot(tt6, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") + 
    geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 * 
    0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) + 
    ggtitle("Cluster size for K=6") + theme_bw()

plot of chunk unnamed-chunk-5


tt7 <- table(d$k.7)
round(100 * tt7/sum(tt7))
## 
##  1  2  3  4  5  6  7 
## 25 32  7  1 11 21  3
tt7 <- data.frame(tt7)
names(tt7) <- c("cluster", "count")
ggplot(tt7, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") + 
    geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 * 
    0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) + 
    ggtitle("Cluster size for K=7") + theme_bw()

plot of chunk unnamed-chunk-5


tt8 <- table(d$k.8)
round(100 * tt8/sum(tt8))
## 
##  1  2  3  4  5  6  7  8 
##  4  5 21 24  3  1 32  9
tt8 <- data.frame(tt8)
names(tt8) <- c("cluster", "count")
ggplot(tt8, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") + 
    geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 * 
    0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) + 
    ggtitle("Cluster size for K=8") + theme_bw()

plot of chunk unnamed-chunk-5


tt10 <- table(d$k.10)
round(100 * tt10/sum(tt10))
## 
##  1  2  3  4  5  6  7  8  9 10 
##  5  1  0  3  4 19 27 14  9 19
tt10 <- data.frame(tt10)
names(tt10) <- c("cluster", "count")
ggplot(tt10, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") + 
    geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 * 
    0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) + 
    ggtitle("Cluster size for K=10") + theme_bw()

plot of chunk unnamed-chunk-5

From this point on, we will only look at the split by 10 clusters.

How different are firms by clustering feature?

Let's make a table of cluster centroids, that is the geometrical centers (means) of the clustering variables for each cluster.

# we add 1 to denominators to avoid Inf and -Inf
d.clust <- d[, list(id, year, lrev = log(revenues + 1), empl, lassets = log(totalAssets + 
    1), pmargin = (revenues - costs)/(revenues + 1), lfape = log(fixedAssets + 
    1) - log(empl + 1))]

# add cluster labels
setkey(d.clust, id, year)
d.clust <- d.clust[clusters[, k.10, by = "id,year"]]


# summarize medians of all variables by
centroids <- d.clust[, lapply(.SD, mean), keyby = k.10, .SDcols = c("lrev", 
    "empl", "lassets", "pmargin", "lfape")]
round(centroids, 2)
##     k.10  lrev   empl lassets    pmargin lfape
##  1:    1  0.65   0.06    0.84      -7.50  0.02
##  2:    2 12.44 162.40   12.89    -203.94  6.90
##  3:    3  0.00  35.58   13.12 -435740.04 10.43
##  4:    4 11.81  58.38   11.96     -32.82  6.80
##  5:    5  0.59   0.86    9.39   -1213.40  7.58
##  6:    6  7.50   1.59    6.91      -0.32 -0.51
##  7:    7  9.03   3.93    8.77      -0.13  5.37
##  8:    8 10.52   9.85   10.91      -0.40  7.55
##  9:    9  0.42   0.41    6.08    -181.49  1.17
## 10:   10  7.24   1.66    6.73      -0.32  3.57

The centroid table shows that clusters are very effective in picking up differences in the features that were used to define them.

Another odd result: the average profit margin of all clusters is negative, which seems to make no sense, because from a quick comparison we can ascertain that in most firm-year pairs revenues are higher than costs. For this reason we compute medians for each cluster, and plot the distribution densities

# summarize medians of all variables by
medians <- d.clust[, lapply(.SD, function(x) as.numeric(median(x))), keyby = k.7, 
    .SDcols = c("lrev", "empl", "lassets", "pmargin", "lfape")]
## Error: object 'k.7' not found
round(medians, 2)
## Error: object 'medians' not found

# plot density focusing on reasonable range of pmargin to avoid compressing
# also excluding from density computation all firms with pmargin < -10
ggplot(d.clust, aes(x = pmargin)) + geom_density(alpha = 0.6, fill = "black", 
    col = NA) + facet_wrap(~k.10) + coord_cartesian(xlim = c(-0.4, 0.4)) + scale_x_continuous(limits = c(-10, 
    1)) + # geom_vline(xintercept=0, col='white') +
theme_bw() + ggtitle("Distribution of profit margin by cluster") + xlab("Profit margin") + 
    ylab("Density of firm-year pairs")
## Warning: Removed 3761 rows containing non-finite values (stat_density).
## Warning: Removed 128 rows containing non-finite values (stat_density).
## Warning: Removed 24 rows containing non-finite values (stat_density).
## Warning: Removed 300 rows containing non-finite values (stat_density).
## Warning: Removed 24078 rows containing non-finite values (stat_density).
## Warning: Removed 866 rows containing non-finite values (stat_density).
## Warning: Removed 506 rows containing non-finite values (stat_density).
## Warning: Removed 900 rows containing non-finite values (stat_density).
## Warning: Removed 38142 rows containing non-finite values (stat_density).
## Warning: Removed 930 rows containing non-finite values (stat_density).

plot of chunk unnamed-chunk-7

How consistent are clusters throughout the years?

Let's compute the number of unique clusters and years for each firm.

d.consistency = d[, list(years = .N, nclust = length(unique(k.10))), by = id]

table(d.consistency$years, d.consistency$nclust)
##    
##         1     2     3     4     5     6
##   1 29680     0     0     0     0     0
##   2 15442  8444     0     0     0     0
##   3 10186  9060  1627     0     0     0
##   4  7080  8321  2611   218     0     0
##   5  5208  7689  3266   528    32     0
##   6  3956  7125  3577   753    60     3
##   7  3445  6780  3584   848    96     8
##   8 14356 21381  7860  1467   179     7

How are clusters geographically distributed?

We load the geographical coordinates based on municipality name (194 levels).

load("~/Dubrovnik/municipality-geocode.rda")  ## loads muni.geo
names(muni.geo)
## [1] "muni" "lon"  "lat"
setkey(d, muni)
setkey(muni.geo, muni)
d <- muni.geo[d]

Plot clusters on the map to see if similar firms are close to each other.

ggplot(d, aes(x = lon, y = lat, col = status), alpha = 0.7) + geom_jitter(size = 2.5) + 
    facet_wrap(~k.10) + theme_bw() + ggtitle("Firm location and status by cluster") + 
    xlab("Longitude") + ylab("Latitude") + labs(list(col = "Status"))

plot of chunk unnamed-chunk-10

We can also look at the regional composition of each cluster.

ggplot(d, aes(x = region)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) + 
    theme_bw() + ggtitle("Firm region by cluster") + xlab("Region") + ylab("Count of firm-year pairs")

plot of chunk unnamed-chunk-11

How are other variables distributed among clusters?

Is there a difference in the composition of firms by status based on their cluster?

ggplot(d, aes(x = status)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) + 
    theme_bw() + ggtitle("Firm status by cluster") + xlab("Status") + ylab("Count of firm-year pairs")

plot of chunk unnamed-chunk-12

What is the sector composition of each cluster?

ggplot(d, aes(x = factor(k.10))) + geom_histogram(fill = "gray50") + facet_wrap(~sector.mod) + 
    theme_bw() + ggtitle("Firm sector by cluster") + xlab("Sector") + ylab("Count of firm-year pairs")

plot of chunk unnamed-chunk-13

What is the state ownership status of each cluster?

ggplot(d, aes(x = state.connection)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) + 
    theme_bw() + ggtitle("State ownership by cluster") + xlab("State connection") + 
    ylab("Count of firm-year pairs")

plot of chunk unnamed-chunk-14

Do clusters correlate to identified shells?

We have identified shells and ruled out the possibility that some other firms are shells in a previous analysis. We could not determine the status of most firms. Let us see if the clusters capture firms of a certain type.

load("~/Dubrovnik/APR-shell-ID-v1.rda")  ## loads shells
setkey(d, id)
# add shell firm type to main data
d <- shells[d, nomatch = 0]

ggplot(d, aes(x = firm.type)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) + 
    theme_bw() + ggtitle("Firm type by cluster") + xlab("Firm type") + ylab("Count of firm-year pairs")

plot of chunk unnamed-chunk-15