Here we try to understand the availability of key year-level data for firms in the APR dataset.
require(data.table)
## Loading required package: data.table
setwd("~/Dubrovnik/")
load("APR-processed-stage1-v3.rda") ## load d, the main data
setkey(d, id)
names(d) ## list of columns available
## [1] "year" "deflator" "id"
## [4] "capital" "costs" "empl"
## [7] "fixedAssets" "foreignOwners" "interest"
## [10] "liabLT" "liabST" "liabSTfin"
## [13] "loansLT" "revenues" "totalAssets"
## [16] "wages" "state.connection" "state.owner.type"
## [19] "state.share" "state.legal.form" "priv.method"
## [22] "reg.code" "region" "district.code"
## [25] "district" "town.code" "town"
## [28] "muni.code" "muni" "estab.date"
## [31] "removal.date" "sector.mod" "ind.code"
## [34] "trade.sector" "trade.name" "legal.form"
## [37] "status" "sitc" "compet.cms"
## [40] "market.growing" "entrepreneur" "restructuring"
## [43] "export.rank" "export"
Prior to this analysis, the data has been read, formatted, transformed from
wide (one row, one firm) to long format (one row, one firm for one year)
using the script clean-data-v1.R
. In this process, firms with no reported
data in any year have a single row, and are missing the year
column.
We need to define several levels of missing values: 1. The firm has no data because it did not exist, or was erased from the register before year end 2. The firm should have data, but has not reported anything (the entire row is missing) 3. The firm has an entry for that year (they reported), but the variable in question is missing (e.g. revenues, or labor costs) 4. The data is not missing
To establish when we should expect data for a give firm we start by creating a full grid of values, that is, an index of all firms by 8 years (2005 to 2012).
d.all <- data.table(expand.grid(id = unique(d[, id]), year = 2005:2012))
setkey(d.all, id) ## index by ID, just like main data
We then add the first and last year of when each firm existed according to estab.date
and removal.date
in our main data d
.
d.all <- d[, list(estab.date = estab.date[1], removal.date = removal.date[1]),
by = id][d.all]
d.all[, `:=`(first.year, pmax(year(estab.date), 2005))]
d.all[, `:=`(last.year, pmin(year(removal.date), 2013))]
d.all[is.na(last.year), `:=`(last.year, 2013)] ## for firms that are still running
d.all[last.year < 2005, `:=`(last.year, 2005)] ## should not happen
Next, we drop those index values that should not exist:
d.all <- d.all[year >= first.year]
d.all <- d.all[year < last.year]
We now add whether specific columns have a non-NA value, and code the columns based on the classification of missing outlined above:
We do this for revenues, total costs, assets, employee number, wages, and interest.
# setup simple variable on whether revenues are recorded (1-NO, 2-YES)
# without those that don't have any years
d.miss <- d[!is.na(year), list(id, year, has.revenue = 1 + (!is.na(revenues)),
has.cost = 1 + (!is.na(costs)), has.asset = 1 + (!is.na(totalAssets)), has.empl = 1 +
(!is.na(empl)), has.wage = 1 + (!is.na(wages)), has.interest = 1 + (!is.na(interest)))]
setkey(d.miss, id, year)
# merge with d.all
setkey(d.all, id, year)
d.all <- d.miss[d.all]
d.all[, `:=`(removal.date, NULL)] ## drop removal.date because of NAs
# replace NAs with 0 (code for: should exist, but doesn't)
sum(is.na(d.all))
d.all[is.na(d.all)] <- 0
This is how the status dataset looks:
d.all
## id year has.revenue has.cost has.asset has.empl has.wage
## 1: 17000012 2005 2 2 2 2 2
## 2: 17000012 2006 2 2 2 2 2
## 3: 17000012 2007 2 2 2 2 2
## 4: 17000012 2008 2 2 2 2 2
## 5: 17000012 2009 2 2 2 2 2
## ---
## 1030914: 9342222 2008 2 2 2 2 2
## 1030915: 9342222 2009 2 2 2 2 2
## 1030916: 9342222 2010 2 2 2 2 2
## 1030917: 9342222 2011 2 2 2 2 2
## 1030918: 9342222 2012 2 2 2 2 2
## has.interest estab.date first.year last.year
## 1: 1 1994-06-30 2005 2013
## 2: 2 1994-06-30 2005 2013
## 3: 2 1994-06-30 2005 2013
## 4: 2 1994-06-30 2005 2013
## 5: 2 1994-06-30 2005 2013
## ---
## 1030914: 2 2004-12-08 2005 2013
## 1030915: 2 2004-12-08 2005 2013
## 1030916: 2 2004-12-08 2005 2013
## 1030917: 2 2004-12-08 2005 2013
## 1030918: 2 2004-12-08 2005 2013
Plotting all 190 thousand firms would be impractical, so we opted to show the status of a radom subset of 2000 firms. The resulting plots are compact but legible.
# use acast to set it into a matrix and use image() to wrap long ribbon into
# several strips
require(reshape2)
## Loading required package: reshape2
setkey(d.all, id)
set.seed(2014)
i <- sample(unique(d.all[, id]), size = 2000)
a.miss <- acast(d.all[i, list(year, has.revenue)], id ~ year, value.var = "has.revenue",
fill = NA)
# create sorting index for better clustering of similar firms
s <- order(rowSums(!is.na(a.miss)), a.miss[, 1], a.miss[, 2], a.miss[, 3], a.miss[,
4], a.miss[, 5], a.miss[, 6], a.miss[, 7], a.miss[, 8], decreasing = TRUE)
a.miss <- a.miss[s, ]
Revenues
par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
from <- 200 * (j - 1) + 1
to <- 200 * j
year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey",
"gold", "brown"))
abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1,
tcl = -0.3, padj = -0.8)
}
title(main = "Revenue data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA) #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "revenue missing",
"not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n",
ncol = 2)
Costs
a.miss <- acast(d.all[i, list(year, has.cost)], id ~ year, value.var = "has.cost",
fill = NA)
a.miss <- a.miss[s, ]
par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
from <- 200 * (j - 1) + 1
to <- 200 * j
year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey",
"gold", "brown"))
abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1,
tcl = -0.3, padj = -0.8)
}
title(main = "Cost data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA) #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "cost missing", "not missing"),
col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", ncol = 2)
Assets
a.miss <- acast(d.all[i, list(year, has.asset)], id ~ year, value.var = "has.asset",
fill = NA)
a.miss <- a.miss[s, ]
par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
from <- 200 * (j - 1) + 1
to <- 200 * j
year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey",
"gold", "brown"))
abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1,
tcl = -0.3, padj = -0.8)
}
title(main = "Asset data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA) #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "assets missing",
"not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n",
ncol = 2)
Employee number
a.miss <- acast(d.all[i, list(year, has.empl)], id ~ year, value.var = "has.empl",
fill = NA)
a.miss <- a.miss[s, ]
par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
from <- 200 * (j - 1) + 1
to <- 200 * j
year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey",
"gold", "brown"))
abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1,
tcl = -0.3, padj = -0.8)
}
title(main = "Employee data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA) #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "employee # missing",
"not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n",
ncol = 2)
Wages
a.miss <- acast(d.all[i, list(year, has.wage)], id ~ year, value.var = "has.wage",
fill = NA)
a.miss <- a.miss[s, ]
par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
from <- 200 * (j - 1) + 1
to <- 200 * j
year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey",
"gold", "brown"))
abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1,
tcl = -0.3, padj = -0.8)
}
title(main = "Wage data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA) #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "wages missing", "not missing"),
col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", ncol = 2)
Interest payment
a.miss <- acast(d.all[i, list(year, has.interest)], id ~ year, value.var = "has.interest",
fill = NA)
a.miss <- a.miss[s, ]
par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
from <- 200 * (j - 1) + 1
to <- 200 * j
year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey",
"gold", "brown"))
abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1,
tcl = -0.3, padj = -0.8)
}
title(main = "Interest data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA) #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "interest missing",
"not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n",
ncol = 2)