*------------------------------------------------------------------------------------------------
* Labor and public economics 
* Fall 2006
* Assignment #1
* Good practice example
*------------------------------------------------------------------------------------------------

*------------------------------------------------------------------------------------------------
* EXERCISE No. 1
* DATA GENERATION
* from multivariate normal distribution we generate age, education, error
*------------------------------------------------------------------------------------------------
drop _all
set seed 100

* parameters of the trivariate normal; columns are (age, education, error)
matrix means = (38, 12, 0)
matrix devs  = (13.8, 2.6, 0.1)
matrix corrs = ( 1  , -0.3, 0 \ ///
                -0.3,  1  , 0 \ ///
                 0  ,  0  , 1 )

drawnorm AGE EDU e, n(1000) means(means) sds(devs) corr(corrs)

* corrections of the generated sample
* age and education are recorded in whole years
replace AGE = round(AGE)
replace EDU = round(EDU)

* potential experience (years since leaving school, school entry at 6) and its square
generate EXP  = AGE - EDU - 6
generate EXP2 = EXP^2

* keep only persons aged 16-65 with non-negative experience
* and at least 5 years of education
keep if inrange(AGE, 16, 65) & EXP >= 0 & EDU >= 5

* log of earnings with a homoscedastic error
generate logY = 0.5 + 0.08*EDU + 0.032*EXP - 0.0006*EXP2 + e

* heteroscedastic errors for Exercise 3:
* the error is scaled by the level of earnings, so its variance is not constant.
* (drawn before the non-negativity filter below, one draw per remaining observation)
generate ehet    = exp(logY)*0.01*invnorm(uniform())
generate logYHET = 0.5 + 0.08*EDU + 0.032*EXP - 0.0006*EXP2 + ehet

* discard observations with negative log earnings in either version
keep if logY >= 0 & logYHET >= 0

* working sample: the first 200 remaining observations
keep if _n <= 200

* data set description
summarize AGE EDU EXP e
correlate AGE EDU EXP e

*------------------------------------------------------------------------------------------------
* EXERCISE No. 2
* REGRESSIONS
*------------------------------------------------------------------------------------------------

* Baseline earnings regression on the variables used to generate logY;
* the estimates should be close to the generating coefficients
* (0.08 for EDU, 0.032 for EXP, -0.0006 for EXP2).
regress logY EDU EXP EXP2

* test each slope against the value used in the data generation
test _b[EDU]  = 0.08
test _b[EXP]  = 0.032
test _b[EXP2] = -0.0006

* The quadratic profile b1*EXP + b2*EXP^2 is maximised at EXP = -b1/(2*b2).

* test EXP of maximum earnings is 35
testnl (-0.5*_b[EXP]/_b[EXP2]) = 35

* test EXP of maximum earnings equals the value implied by the generating
* coefficients, i.e. 0.032/(2*0.0006) = 26.67 (computed, not hard-coded,
* so it stays in line with the coefficients above)
local exp_peak = 0.032/(2*0.0006)
testnl (-0.5*_b[EXP]/_b[EXP2]) = `exp_peak'

* omitted variables: leave out one regressor at a time
regress logY EXP EXP2
regress logY EDU EXP2
regress logY EDU EXP

* estimation using levels of earnings instead of logs
generate Y = exp(logY)
summarize Y
regress Y EDU EXP EXP2


*------------------------------------------------------------------------------------------------
* EXERCISE No. 3
* REGRESSIONS WITH PECULIARITIES
*------------------------------------------------------------------------------------------------


* heteroscedasticity
* OLS on the outcome built with the heteroscedastic error (logYHET)
reg logYHET EDU EXP EXP2
* residuals computed by hand from the stored coefficients
gen error=logYHET-_b[EDU]*EDU-_b[EXP]*EXP -_b[EXP2]*EXP2-_b[_cons]
* visual check, then the formal test after the regression above
scatter error logYHET
hettest
* same regression with heteroscedasticity-robust standard errors
reg logYHET EDU EXP EXP2, robust

* measurement error in EDU
* we assume that older people tend to report higher education than they actually have

set seed 500
* error1: reporting error (mean 3, sd 0.5)
* error2: auxiliary variable (mean 0, sd 2), correlated 0.8 with error1
matrix m1=(3,0)
matrix d1=(0.5, 2 )
matrix c1=(1 , 0.8 \ 0.8, 1)

* n() is deliberately omitted: drawnorm then uses the current number of
* observations and adds the new variables to the data in memory, so this
* step does not depend on the sample size being exactly 200
drawnorm error1 error2, means(m1) sds(d1) corr(c1)
*gen EDUERR = EDU + error1

* only persons older than 45 misreport; reported education is in whole years
gen EDUERR = EDU
replace EDUERR = EDU + error1 if AGE>45
replace EDUERR= round(EDUERR)

* OLS with the mismeasured regressor, then IV using AGE and error2 as instruments
reg logY EDUERR EXP EXP2
ivreg logY EXP EXP2 ( EDUERR = AGE error2)

* measurement error in logY
* noise added to the dependent variable only
gen logYERR = logY + 0.01*invnorm(uniform())
reg logYERR EDU EXP EXP2

* third order polynomial of exp
gen EXP3 = EXP2*EXP
reg logY EDU EXP EXP2 EXP3

* second order polynomial of age instead of exp
gen AGE2=AGE^2
reg logY EDU AGE AGE2


*------------------------------------------------------------------------------------------------
* EXERCISE No. 4
* METHOD OF SPLINES
*------------------------------------------------------------------------------------------------

* look at the raw earnings-experience profile first
scatter logY EXP

* linear spline in experience with a single knot at 23 years:
* EXPS1 covers experience up to the knot, EXPS2 the part above it
mkspline EXPS1 23 EXPS2 = EXP

* piecewise-linear experience profile, controlling for education
regress logY EDU EXPS1 EXPS2

/*
*------------------------------------------------------------------------------------------------
* EXERCISE No. 5
* LOOP for regression
*------------------------------------------------------------------------------------------------

mat B=J(300,3,0)   

set output error	/* no output from regression on the screen*/
forvalues i = 1(1)300{
	set seed `i'
	set obs 1000
	drawnorm agen`i' edun`i' en`i', n(1000) means(means) sds(devs) corr(corrs) 
	replace agen`i'=round(agen`i')
	replace edun`i'=round(edun`i')
	gen expn`i'=agen`i'-edun`i'-6
	gen exp2n`i'=expn`i'^2
	drop if agen`i'<16
	drop if agen`i'>65
	drop if expn`i'<0
	drop if edun`i'<5
	gen logyn`i' = 0.5 + 0.08*edun`i'+0.032*expn`i'-0.0006*exp2n`i' + en`i'
	drop if logyn`i'<0
	drop if _n>200

	quietly reg logyn`i' edun`i' expn`i' exp2n`i'

	mat B[`i',1]=`i'           
	mat B[`i',2]=_b[edun`i']   
	mat B[`i',3]=_se[edun`i']  
	drop logyn`i' edun`i' expn`i' exp2n`i' agen`i'
}

set output proc /* end of no output from regression on the screen*/

* mat l B   
svmat B,name(bb)   
sum bb2 
*hist bb2
*kdensity bb2
*/
