Here we look at understanding the differences between the clusters we identified in the previous analysis. We will attach data from the
require(data.table, quietly = TRUE)
setwd("~/Dubrovnik/")
load("APR-processed-v4.rda") ## loads d
names(d) ## print column names as a reference
## [1] "year" "EURRSD" "deflator"
## [4] "id" "capital" "costs"
## [7] "empl" "fixedAssets" "foreignOwners"
## [10] "interest" "liabLT" "liabST"
## [13] "liabSTfin" "loansLT" "revenues"
## [16] "totalAssets" "wages" "state.connection"
## [19] "state.owner.type" "state.share" "state.legal.form"
## [22] "priv.method" "reg.code" "region"
## [25] "district.code" "district" "town.code"
## [28] "town" "muni.code" "muni"
## [31] "estab.date" "removal.date" "sector.mod"
## [34] "ind.code" "trade.sector" "trade.name"
## [37] "legal.form" "status" "sitc"
## [40] "compet.cms" "market.growing" "entrepreneur"
## [43] "restructuring" "export.rank" "export"
## [46] "is.sme"
setkey(d, id, year)
We have only run the clustering analysis on SMEs, so we exclude the rest of the companies.
sum(!d$is.sme) ## how many non-SME observations are there?
## [1] 5640
d <- d[is.sme == TRUE] ## only keep those with non-NA year
We also load cluster assignment data and plot the within-cluster sum of squares to choose the number of clusters.
load("~/Dubrovnik/Clustering/APR-km-clusters-v4.rda") ## loads clusters
names(clusters)
## [1] "id" "year" "k.3" "k.4" "k.5" "k.6" "k.7" "k.8" "k.9" "k.10"
require(ggplot2, quietly = TRUE)
ggplot(data.frame(k = 2 + 1:length(within.ss), within.ss), aes(x = k, y = within.ss/1e+06)) +
geom_line(aes(group = 1)) + xlab("Number of clusters") + ylab("Within-cluster sum of squares (M)") +
ggtitle("Within-cluster SS from K-means clustering\n without location data") +
scale_x_continuous(breaks = 2 + 1:length(within.ss)) + theme_bw() + theme(panel.grid.minor.x = element_blank())
We will be using 6, 7, 8, or 10 clustersand see how they are distributed to make a judgement on how many to use.
setkey(clusters, id, year)
d <- d[clusters[, list(k.6, k.7, k.8, k.10), by = "id,year"]]
We first show a simple tabulation and histograms to understand the size of each cluster at 6, 7, and 8 clusters.
require(ggplot2, quietly = TRUE)
tt6 <- table(d$k.6)
round(100 * tt6/sum(tt6))
##
## 1 2 3 4 5 6
## 7 2 20 25 11 36
tt6 <- data.frame(tt6)
names(tt6) <- c("cluster", "count")
ggplot(tt6, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") +
geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 *
0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) +
ggtitle("Cluster size for K=6") + theme_bw()
tt7 <- table(d$k.7)
round(100 * tt7/sum(tt7))
##
## 1 2 3 4 5 6 7
## 25 32 7 1 11 21 3
tt7 <- data.frame(tt7)
names(tt7) <- c("cluster", "count")
ggplot(tt7, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") +
geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 *
0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) +
ggtitle("Cluster size for K=7") + theme_bw()
tt8 <- table(d$k.8)
round(100 * tt8/sum(tt8))
##
## 1 2 3 4 5 6 7 8
## 4 5 21 24 3 1 32 9
tt8 <- data.frame(tt8)
names(tt8) <- c("cluster", "count")
ggplot(tt8, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") +
geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 *
0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) +
ggtitle("Cluster size for K=8") + theme_bw()
tt10 <- table(d$k.10)
round(100 * tt10/sum(tt10))
##
## 1 2 3 4 5 6 7 8 9 10
## 5 1 0 3 4 19 27 14 9 19
tt10 <- data.frame(tt10)
names(tt10) <- c("cluster", "count")
ggplot(tt10, aes(x = cluster, y = count)) + geom_bar(stat = "identity", fill = "gray50") +
geom_text(aes(label = round(count/1000)), vjust = -0.4, size = 4) + scale_y_continuous(breaks = 1e+05 *
0:3, labels = 100 * 0:3) + labs(list(x = "Cluster", y = "Number of observations (in 000)")) +
ggtitle("Cluster size for K=10") + theme_bw()
From this point on, we will only look at the split by 10 clusters.
Let's make a table of cluster centroids, that is the geometrical centers (means) of the clustering variables for each cluster.
# we add 1 to denominators to avoid Inf and -Inf
d.clust <- d[, list(id, year, lrev = log(revenues + 1), empl, lassets = log(totalAssets +
1), pmargin = (revenues - costs)/(revenues + 1), lfape = log(fixedAssets +
1) - log(empl + 1))]
# add cluster labels
setkey(d.clust, id, year)
d.clust <- d.clust[clusters[, k.10, by = "id,year"]]
# summarize medians of all variables by
centroids <- d.clust[, lapply(.SD, mean), keyby = k.10, .SDcols = c("lrev",
"empl", "lassets", "pmargin", "lfape")]
round(centroids, 2)
## k.10 lrev empl lassets pmargin lfape
## 1: 1 0.65 0.06 0.84 -7.50 0.02
## 2: 2 12.44 162.40 12.89 -203.94 6.90
## 3: 3 0.00 35.58 13.12 -435740.04 10.43
## 4: 4 11.81 58.38 11.96 -32.82 6.80
## 5: 5 0.59 0.86 9.39 -1213.40 7.58
## 6: 6 7.50 1.59 6.91 -0.32 -0.51
## 7: 7 9.03 3.93 8.77 -0.13 5.37
## 8: 8 10.52 9.85 10.91 -0.40 7.55
## 9: 9 0.42 0.41 6.08 -181.49 1.17
## 10: 10 7.24 1.66 6.73 -0.32 3.57
The centroid table shows that clusters are very effective in picking up differences in the features that were used to define them.
Another odd result: the average profit margin of all clusters is negative, which seems to make no sense, because from a quick comparison we can ascertain that in most firm-year pairs revenues are higher than costs. For this reason we compute medians for each cluster, and plot the distribution densities
# summarize medians of all variables by
medians <- d.clust[, lapply(.SD, function(x) as.numeric(median(x))), keyby = k.7,
.SDcols = c("lrev", "empl", "lassets", "pmargin", "lfape")]
## Error: object 'k.7' not found
round(medians, 2)
## Error: object 'medians' not found
# plot density focusing on reasonable range of pmargin to avoid compressing
# also excluding from density computation all firms with pmargin < -10
ggplot(d.clust, aes(x = pmargin)) + geom_density(alpha = 0.6, fill = "black",
col = NA) + facet_wrap(~k.10) + coord_cartesian(xlim = c(-0.4, 0.4)) + scale_x_continuous(limits = c(-10,
1)) + # geom_vline(xintercept=0, col='white') +
theme_bw() + ggtitle("Distribution of profit margin by cluster") + xlab("Profit margin") +
ylab("Density of firm-year pairs")
## Warning: Removed 3761 rows containing non-finite values (stat_density).
## Warning: Removed 128 rows containing non-finite values (stat_density).
## Warning: Removed 24 rows containing non-finite values (stat_density).
## Warning: Removed 300 rows containing non-finite values (stat_density).
## Warning: Removed 24078 rows containing non-finite values (stat_density).
## Warning: Removed 866 rows containing non-finite values (stat_density).
## Warning: Removed 506 rows containing non-finite values (stat_density).
## Warning: Removed 900 rows containing non-finite values (stat_density).
## Warning: Removed 38142 rows containing non-finite values (stat_density).
## Warning: Removed 930 rows containing non-finite values (stat_density).
Let's compute the number of unique clusters and years for each firm.
d.consistency = d[, list(years = .N, nclust = length(unique(k.10))), by = id]
table(d.consistency$years, d.consistency$nclust)
##
## 1 2 3 4 5 6
## 1 29680 0 0 0 0 0
## 2 15442 8444 0 0 0 0
## 3 10186 9060 1627 0 0 0
## 4 7080 8321 2611 218 0 0
## 5 5208 7689 3266 528 32 0
## 6 3956 7125 3577 753 60 3
## 7 3445 6780 3584 848 96 8
## 8 14356 21381 7860 1467 179 7
We load the geographical coordinates based on municipality name (194 levels).
load("~/Dubrovnik/municipality-geocode.rda") ## loads muni.geo
names(muni.geo)
## [1] "muni" "lon" "lat"
setkey(d, muni)
setkey(muni.geo, muni)
d <- muni.geo[d]
Plot clusters on the map to see if similar firms are close to each other.
ggplot(d, aes(x = lon, y = lat, col = status), alpha = 0.7) + geom_jitter(size = 2.5) +
facet_wrap(~k.10) + theme_bw() + ggtitle("Firm location and status by cluster") +
xlab("Longitude") + ylab("Latitude") + labs(list(col = "Status"))
We can also look at the regional composition of each cluster.
ggplot(d, aes(x = region)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) +
theme_bw() + ggtitle("Firm region by cluster") + xlab("Region") + ylab("Count of firm-year pairs")
Is there a difference in the composition of firms by status based on their cluster?
ggplot(d, aes(x = status)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) +
theme_bw() + ggtitle("Firm status by cluster") + xlab("Status") + ylab("Count of firm-year pairs")
What is the sector composition of each cluster?
ggplot(d, aes(x = factor(k.10))) + geom_histogram(fill = "gray50") + facet_wrap(~sector.mod) +
theme_bw() + ggtitle("Firm sector by cluster") + xlab("Sector") + ylab("Count of firm-year pairs")
What is the state ownership status of each cluster?
ggplot(d, aes(x = state.connection)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) +
theme_bw() + ggtitle("State ownership by cluster") + xlab("State connection") +
ylab("Count of firm-year pairs")
We have identified shells and ruled out the possibility that some other firms are shells in a previous analysis. We could not determine the status of most firms. Let us see if the clusters capture firms of a certain type.
load("~/Dubrovnik/APR-shell-ID-v1.rda") ## loads shells
setkey(d, id)
# add shell firm type to main data
d <- shells[d, nomatch = 0]
ggplot(d, aes(x = firm.type)) + geom_histogram(fill = "gray50") + facet_wrap(~k.10) +
theme_bw() + ggtitle("Firm type by cluster") + xlab("Firm type") + ylab("Count of firm-year pairs")