Mapping missing variables in the APR data

Here we try to understand the availability of key year-level data for firms in the APR dataset.

require(data.table)

## Loading required package: data.table

setwd("~/Dubrovnik/")
load("APR-processed-stage1-v3.rda")  ## load d, the main data
setkey(d, id)
names(d)  ## list of columns available

##  [1] "year"             "deflator"         "id"              
##  [4] "capital"          "costs"            "empl"            
##  [7] "fixedAssets"      "foreignOwners"    "interest"        
## [10] "liabLT"           "liabST"           "liabSTfin"       
## [13] "loansLT"          "revenues"         "totalAssets"     
## [16] "wages"            "state.connection" "state.owner.type"
## [19] "state.share"      "state.legal.form" "priv.method"     
## [22] "reg.code"         "region"           "district.code"   
## [25] "district"         "town.code"        "town"            
## [28] "muni.code"        "muni"             "estab.date"      
## [31] "removal.date"     "sector.mod"       "ind.code"        
## [34] "trade.sector"     "trade.name"       "legal.form"      
## [37] "status"           "sitc"             "compet.cms"      
## [40] "market.growing"   "entrepreneur"     "restructuring"   
## [43] "export.rank"      "export"

Prior to this analysis, the data has been read, formatted, transformed from wide (one row, one firm) to long format (one row, one firm for one year) using the script clean-data-v1.R. In this process, firms with no reported data in any year have a single row, and are missing the year column.

We need to define several levels of missing values: 1. The firm has no data because it did not exist, or was erased from the register before year end 2. The firm should have data, but has not reported anything (the entire row is missing) 3. The firm has an entry for that year (they reported), but the variable in question is missing (e.g. revenues, or labor costs) 4. The data is not missing

Determine years when firm should have data

To establish when we should expect data for a give firm we start by creating a full grid of values, that is, an index of all firms by 8 years (2005 to 2012).

d.all <- data.table(expand.grid(id = unique(d[, id]), year = 2005:2012))
setkey(d.all, id)  ## index by ID, just like main data

We then add the first and last year of when each firm existed according to estab.date and removal.date in our main data d.

d.all <- d[, list(estab.date = estab.date[1], removal.date = removal.date[1]), 
    by = id][d.all]
d.all[, `:=`(first.year, pmax(year(estab.date), 2005))]
d.all[, `:=`(last.year, pmin(year(removal.date), 2013))]
d.all[is.na(last.year), `:=`(last.year, 2013)]  ## for firms that are still running
d.all[last.year < 2005, `:=`(last.year, 2005)]  ## should not happen

Next, we drop those index values that should not exist:

years before establishment
years after removal (including year of removal, as firms report at the end)

d.all <- d.all[year >= first.year]
d.all <- d.all[year < last.year]

Create dataset with state of each variable

We now add whether specific columns have a non-NA value, and code the columns based on the classification of missing outlined above:

Should not exist: not in our dataset
Should exist but row is missing: 0
Row exists but this variable missing: 1
Variable is not missing: 2

We do this for revenues, total costs, assets, employee number, wages, and interest.

# setup simple variable on whether revenues are recorded (1-NO, 2-YES)
# without those that don't have any years
d.miss <- d[!is.na(year), list(id, year, has.revenue = 1 + (!is.na(revenues)), 
    has.cost = 1 + (!is.na(costs)), has.asset = 1 + (!is.na(totalAssets)), has.empl = 1 + 
        (!is.na(empl)), has.wage = 1 + (!is.na(wages)), has.interest = 1 + (!is.na(interest)))]
setkey(d.miss, id, year)
# merge with d.all
setkey(d.all, id, year)
d.all <- d.miss[d.all]
d.all[, `:=`(removal.date, NULL)]  ## drop removal.date because of NAs
# replace NAs with 0 (code for: should exist, but doesn't)
sum(is.na(d.all))
d.all[is.na(d.all)] <- 0

This is how the status dataset looks:

d.all

##                id year has.revenue has.cost has.asset has.empl has.wage
##       1: 17000012 2005           2        2         2        2        2
##       2: 17000012 2006           2        2         2        2        2
##       3: 17000012 2007           2        2         2        2        2
##       4: 17000012 2008           2        2         2        2        2
##       5: 17000012 2009           2        2         2        2        2
##      ---                                                               
## 1030914:  9342222 2008           2        2         2        2        2
## 1030915:  9342222 2009           2        2         2        2        2
## 1030916:  9342222 2010           2        2         2        2        2
## 1030917:  9342222 2011           2        2         2        2        2
## 1030918:  9342222 2012           2        2         2        2        2
##          has.interest estab.date first.year last.year
##       1:            1 1994-06-30       2005      2013
##       2:            2 1994-06-30       2005      2013
##       3:            2 1994-06-30       2005      2013
##       4:            2 1994-06-30       2005      2013
##       5:            2 1994-06-30       2005      2013
##      ---                                             
## 1030914:            2 2004-12-08       2005      2013
## 1030915:            2 2004-12-08       2005      2013
## 1030916:            2 2004-12-08       2005      2013
## 1030917:            2 2004-12-08       2005      2013
## 1030918:            2 2004-12-08       2005      2013

Plots of variable status

Plotting all 190 thousand firms would be impractical, so we opted to show the status of a radom subset of 2000 firms. The resulting plots are compact but legible.

# use acast to set it into a matrix and use image() to wrap long ribbon into
# several strips
require(reshape2)

## Loading required package: reshape2

setkey(d.all, id)
set.seed(2014)
i <- sample(unique(d.all[, id]), size = 2000)
a.miss <- acast(d.all[i, list(year, has.revenue)], id ~ year, value.var = "has.revenue", 
    fill = NA)

# create sorting index for better clustering of similar firms
s <- order(rowSums(!is.na(a.miss)), a.miss[, 1], a.miss[, 2], a.miss[, 3], a.miss[, 
    4], a.miss[, 5], a.miss[, 6], a.miss[, 7], a.miss[, 8], decreasing = TRUE)
a.miss <- a.miss[s, ]

Revenues

par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
    from <- 200 * (j - 1) + 1
    to <- 200 * j
    year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
    image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey", 
        "gold", "brown"))
    abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
    abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
    axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1, 
        tcl = -0.3, padj = -0.8)
}
title(main = "Revenue data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA)  #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "revenue missing", 
    "not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", 
    ncol = 2)

plot of chunk unnamed-chunk-8

Costs

a.miss <- acast(d.all[i, list(year, has.cost)], id ~ year, value.var = "has.cost", 
    fill = NA)
a.miss <- a.miss[s, ]

par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
    from <- 200 * (j - 1) + 1
    to <- 200 * j
    year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
    image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey", 
        "gold", "brown"))
    abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
    abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
    axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1, 
        tcl = -0.3, padj = -0.8)
}
title(main = "Cost data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA)  #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "cost missing", "not missing"), 
    col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", ncol = 2)

plot of chunk unnamed-chunk-9

Assets

a.miss <- acast(d.all[i, list(year, has.asset)], id ~ year, value.var = "has.asset", 
    fill = NA)
a.miss <- a.miss[s, ]

par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
    from <- 200 * (j - 1) + 1
    to <- 200 * j
    year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
    image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey", 
        "gold", "brown"))
    abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
    abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
    axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1, 
        tcl = -0.3, padj = -0.8)
}
title(main = "Asset data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA)  #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "assets missing", 
    "not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", 
    ncol = 2)

plot of chunk unnamed-chunk-10

Employee number

a.miss <- acast(d.all[i, list(year, has.empl)], id ~ year, value.var = "has.empl", 
    fill = NA)
a.miss <- a.miss[s, ]

par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
    from <- 200 * (j - 1) + 1
    to <- 200 * j
    year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
    image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey", 
        "gold", "brown"))
    abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
    abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
    axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1, 
        tcl = -0.3, padj = -0.8)
}
title(main = "Employee data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA)  #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "employee # missing", 
    "not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", 
    ncol = 2)

plot of chunk unnamed-chunk-11

Wages

a.miss <- acast(d.all[i, list(year, has.wage)], id ~ year, value.var = "has.wage", 
    fill = NA)
a.miss <- a.miss[s, ]

par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
    from <- 200 * (j - 1) + 1
    to <- 200 * j
    year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
    image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey", 
        "gold", "brown"))
    abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
    abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
    axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1, 
        tcl = -0.3, padj = -0.8)
}
title(main = "Wage data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA)  #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "wages missing", "not missing"), 
    col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", ncol = 2)

plot of chunk unnamed-chunk-12

Interest payment

a.miss <- acast(d.all[i, list(year, has.interest)], id ~ year, value.var = "has.interest", 
    fill = NA)
a.miss <- a.miss[s, ]

par(mar = c(3, 0.7, 0.7, 0.7))
par(oma = c(1, 2, 3, 1))
par(mfrow = c(1, 10))
for (j in 1:10) {
    from <- 200 * (j - 1) + 1
    to <- 200 * j
    year.labels <- paste0(rep(c("0", ""), c(5, 3)), 5:12)
    image(t(a.miss[to:from, ]), axes = FALSE, breaks = c(-1, 0, 1, 2), col = c("grey", 
        "gold", "brown"))
    abline(h = seq(19, 179, by = 20)/199 - 1/398, col = "white", lwd = 1)
    abline(v = (1:7)/7 - 1/14, col = "white", lwd = 1)
    axis(side = 1, at = seq(0, 1, length.out = 8), labels = year.labels, cex.axis = 1, 
        tcl = -0.3, padj = -0.8)
}
title(main = "Interest data availability for 2000 random firms", outer = TRUE)
mtext(text = "Firm", side = 2, outer = TRUE)
mtext(text = "Year", side = 1, outer = TRUE)
par(usr = c(0, 1, 0, 1), xpd = NA)  #set coordinates and allow outside plotting
legend(x = -2.1, y = 1.06, legend = c("all year missing", "interest missing", 
    "not missing"), col = c("grey", "gold", "brown"), pch = 15, cex = 1, bty = "n", 
    ncol = 2)

plot of chunk unnamed-chunk-13