# Create 10,000 true IQ scores, 20,000 error scores, and two sets of 10,000 observed scores (mean=100, SD=15, reliability=0.8) from them and plot the results. set.seed(243) true_score <- round(rnorm(n=10000,m=100,sd=13.42),digits=0) error_score1 <- round(rnorm(n=10000,m=0,sd=6.71),digits=1) error_score2 <- round(rnorm(n=10000,m=0,sd=6.71),digits=1) observed_score1 <- round(true_score + error_score1,digits=0) observed_score2 <- round(true_score + error_score2,digits=0) hist(true_score, breaks=100, col="seagreen", main="Histogram of true scores\nN=10,000", xlab="True score",xaxt='n',ylim=c(0,350)) axis_labels<-c(40,50,60,70,80,90,100,110,120,130,140,150,160) axis(side=1, at=seq(40,160,10), labels=axis_labels) hist(error_score1, breaks=50, col="red", main="Histogram of error scores\nN=10,000", xlab="Error score", xaxt='n') axis(side=1, at=seq(-30,30, 5)) hist(observed_score1, breaks=100, col="slategrey", main="Histogram of observed scores\nN=10,000", xlab="Observed IQ score",xaxt='n',ylim=c(0,300)) axis_labels<-c(40,50,60,70,80,90,100,110,120,130,140,150,160) axis(side=1, at=seq(40,160,10), labels=axis_labels) # Regress retest scores on original scores and plot the results. regr1=lm(observed_score2~observed_score1) summary(regr1) axis_labels<-c(50,60,70,80,90,100,110,120,130,140,150,160) plot(observed_score1,observed_score2,main="Test and retest, N=10,000\nMean=100, SD=15, reliability=0.80", xlab="Observed score, first test", ylab="Observed score, second test",pch=15, col="slategrey") abline(regr1,col="red") axis(2,at=axis_labels, labels=axis_labels) axis(1,at=axis_labels, labels=axis_labels) intercept<-summary(regr1)$coefficients[1] slope<-summary(regr1)$coefficients[2] segments(x0=60,y0=0,x1=60,y1=intercept+slope*60, col="blue") arrows(x0=60,y0=intercept+slope*60,x1=38,y1=intercept+slope*60, length = 0.20, col="blue") segments(x0=80,y0=0,x1=80,y1=intercept+slope*80, col="blue") arrows(x0=80,y0=intercept+slope*80,x1=38,y1=intercept+slope*80, length = 0.20, col="blue") segments(x0=120,y0=0,x1=120,y1=intercept+slope*120, col="blue") arrows(x0=120,y0=intercept+slope*120,x1=38,y1=intercept+slope*120, length = 0.20, col="blue") segments(x0=100,y0=0,x1=100,y1=intercept+slope*100, col="blue") arrows(x0=100,y0=intercept+slope*100,x1=38,y1=intercept+slope*100, length = 0.20, col="blue") segments(x0=140,y0=0,x1=140,y1=intercept+slope*140, col="blue") arrows(x0=140,y0=intercept+slope*140,x1=38,y1=intercept+slope*140, length = 0.20, col="blue") # Select those with IQ=130 from the simulated data and writes out their various scores as well as means for all scores in a csv file. IQ_data<-cbind(observed_score1,observed_score2,error_score1,error_score2,true_score) IQ_data_selected<-subset(IQ_data,observed_score1==130,select=observed_score1:true_score) IQ_data_selected<-rbind(IQ_data_selected,round(colMeans(IQ_data_selected),digits=1)) write.table(IQ_data_selected,"IQ_130.csv") # Plot the distributions of true and observed scores from the simulation. hist(true_score,breaks=100,col=rgb(0,0,1,0.6), main="Histogram of true and observed scores\nTrue scores=blue, observed scores=red, overlapping=purple",xlab="Score",xaxt='n') hist(observed_score1,breaks=100,add=T, col=rgb(1,0,0,0.5),xaxt='n') axis_labels<-c(40,50,60,70,80,90,100,110,120,130,140,150,160) axis(side=1, at=seq(40,160,10), labels=axis_labels) # Regress original scores on retest scores and plot the results. regr2 <- lm(observed_score1~observed_score2) plot(observed_score2,observed_score1,main="Predicting original scores from retest scores, N=10,000\nMean=100, SD=15, reliability=0.80", xlab="Observed score, second test", ylab="Observed score, first test",pch=15, col="slategrey") abline(regr2,col="red") axis(2,at=axis_labels, labels=axis_labels) axis(1,at=axis_labels, labels=axis_labels) intercept2<-summary(regr2)$coefficients[1] slope2<-summary(regr2)$coefficients[2] arrows(x1=60,y1=38,x0=60,y0=intercept2+slope2*60, length = 0.20, col="blue") segments(x1=60,y1=intercept2+slope2*60,x0=38,y0=intercept2+slope2*60, col="blue") arrows(x1=80,y1=38,x0=80,y0=intercept2+slope2*80, length = 0.20, col="blue") segments(x0=80,y0=intercept2+slope2*80,x1=38,y1=intercept2+slope2*80, col="blue") arrows(x1=120,y1=38,x0=120,y0=intercept2+slope2*120, length = 0.20,col="blue") segments(x0=120,y0=intercept2+slope2*120,x1=38,y1=intercept2+slope2*120, col="blue") arrows(x1=100,y1=38,x0=100,y0=intercept2+slope2*100, length = 0.20,col="blue") segments(x0=100,y0=intercept2+slope2*100,x1=38,y1=intercept2+slope2*100, col="blue") arrows(x1=140,y1=38,x0=140,y0=intercept2+slope2*140, length = 0.20,col="blue") segments(x0=140,y0=intercept2+slope2*140,x1=38,y1=intercept2+slope2*140, col="blue") # Calculate the averages of the two sets of observed scores and compute the reliability of these sum scores by calculating the squared correlation between them and true scores. obs_sum<-(observed_score1+observed_score2)/2 cor(obs_sum,true_score)^2 # Select those with certain true scores from the simulated data and write out various statistics about them in a csv file. IQ_data<-cbind(observed_score1,observed_score2,error_score1,error_score2,true_score) IQ_data_60<-subset(IQ_data,true_score==60,select=c(true_score, error_score1)) IQ_data_70<-subset(IQ_data,true_score==70,select=c(true_score, error_score1)) IQ_data_80<-subset(IQ_data,true_score==80,select=c(true_score, error_score2)) IQ_data_90<-subset(IQ_data,true_score==90,select=c(true_score, error_score1)) IQ_data_100<-subset(IQ_data,true_score==100,select=c(true_score, error_score2)) IQ_data_110<-subset(IQ_data,true_score==110,select=c(true_score, error_score1)) IQ_data_120<-subset(IQ_data,true_score==120,select=c(true_score, error_score2)) IQ_data_130<-subset(IQ_data,true_score==130,select=c(true_score, error_score1)) IQ_data_140<-subset(IQ_data,true_score==140,select=c(true_score, error_score1)) true_and_error<-do.call("rbind", list(round(colMeans(IQ_data_60),1),round(colMeans(IQ_data_70),1),round(colMeans(IQ_data_80),1), round(colMeans(IQ_data_90),1), round(colMeans(IQ_data_100),1),round(colMeans(IQ_data_110),1),round(colMeans(IQ_data_120),1), round(colMeans(IQ_data_130),1),round(colMeans(IQ_data_140),1))) standard_devs<-do.call("rbind", list(round(sd(IQ_data_60[,2]),2),round(sd(IQ_data_70[,2]),2),round(sd(IQ_data_80[,2]),2),round(sd(IQ_data_90[,2]),2),round(sd(IQ_data_100[,2]),2),round(sd(IQ_data_110[,2]),2),round(sd(IQ_data_120[,2]),2),round(sd(IQ_data_130[,2]),2),round(sd(IQ_data_140[,2]),2))) Ns<-do.call("rbind", list(nrow(IQ_data_60),nrow(IQ_data_70),nrow(IQ_data_80),nrow(IQ_data_90),nrow(IQ_data_100),nrow(IQ_data_110),nrow(IQ_data_120),nrow(IQ_data_130),nrow(IQ_data_140))) true_and_error<-cbind(true_and_error,standard_devs) true_and_error<-cbind(true_and_error,Ns) colnames(true_and_error)<-c("True score","Mean error","SD of error","N") write.table(true_and_error,"true_and_error_scores.csv",row.names = FALSE) # Select those with certain observed scores from the simulated data and write out various statistics about them in a csv file. IQ_data<-cbind(observed_score1,observed_score2,error_score1,error_score2,true_score) IQ_data_60<-subset(IQ_data,observed_score1==60,select=c(observed_score1, error_score1)) IQ_data_70<-subset(IQ_data,observed_score1==70,select=c(observed_score1, error_score1)) IQ_data_80<-subset(IQ_data,observed_score1==80,select=c(observed_score1, error_score1)) IQ_data_90<-subset(IQ_data,observed_score1==90,select=c(observed_score1, error_score1)) IQ_data_100<-subset(IQ_data,observed_score1==100,select=c(observed_score1, error_score1)) IQ_data_110<-subset(IQ_data,observed_score1==110,select=c(observed_score1, error_score1)) IQ_data_120<-subset(IQ_data,observed_score1==120,select=c(observed_score1, error_score1)) IQ_data_130<-subset(IQ_data,observed_score1==130,select=c(observed_score1, error_score1)) IQ_data_140<-subset(IQ_data,observed_score1==140,select=c(observed_score1, error_score1)) observed_and_error<-do.call("rbind", list(round(colMeans(IQ_data_60),1),round(colMeans(IQ_data_70),1),round(colMeans(IQ_data_80),1), round(colMeans(IQ_data_90),1), round(colMeans(IQ_data_100),1),round(colMeans(IQ_data_110),1),round(colMeans(IQ_data_120),1), round(colMeans(IQ_data_130),1),round(colMeans(IQ_data_140),1))) standard_devs<-do.call("rbind", list(round(sd(IQ_data_60[,2]),2),round(sd(IQ_data_70[,2]),2),round(sd(IQ_data_80[,2]),2),round(sd(IQ_data_90[,2]),2),round(sd(IQ_data_100[,2]),2),round(sd(IQ_data_110[,2]),2),round(sd(IQ_data_120[,2]),2),round(sd(IQ_data_130[,2]),2),round(sd(IQ_data_140[,2]),2))) Ns<-do.call("rbind", list(nrow(IQ_data_60),nrow(IQ_data_70),nrow(IQ_data_80),nrow(IQ_data_90),nrow(IQ_data_100),nrow(IQ_data_110),nrow(IQ_data_120),nrow(IQ_data_130),nrow(IQ_data_140))) observed_and_error<-cbind(observed_and_error,standard_devs) observed_and_error<-cbind(observed_and_error,Ns) colnames(observed_and_error)<-c("Observed score","Mean error","SD of error","N") write.table(observed_and_error,"observed_and_error_scores.csv",row.names = FALSE) # Plot observed scores against error scores with a regression line. axis_labels_x<-c(40,50,60,70,80,90,100,110,120,130,140,150,160) axis_labels_y<-c(-30,-25,-20,-15,-10,-5,0,5,10,15,20,25,30) plot(observed_score1,error_score1,main="The relationship between observed and error scores\nN=10,000, reliability=0.80", xlab="Observed score", ylab="Error score",pch=15,col="slategrey") abline(lm(error_score1~observed_score1),col="red") axis(1,at=axis_labels_x, labels=axis_labels_x) axis(2,at=axis_labels_y, labels=axis_labels_y) # Calculate true score estimates and compare their prediction error to that of observed scores. Do the same for observed scores that are smaller than 80 or larger than 120. IQ_data<-cbind(observed_score1,observed_score2,error_score1,error_score2,true_score) True_score_est <- mean(IQ_data[,1])+0.8*(IQ_data[,1]-mean(IQ_data[,1])) diff_from_true_for_est<-abs(IQ_data[,5]-True_score_est) mean(abs(IQ_data[,3])) mean(diff_from_true_for_est) IQ_data_extreme 120 | IQ_data[,1] < 80) True_score_est_extreme <- mean(IQ_data[,1])+0.8*(IQ_data_extreme[,1]-mean(IQ_data[,1])) diff_from_true_for_est_extreme<-abs(IQ_data_extreme[,5]-True_score_est_extreme) mean(abs(IQ_data_extreme[,3])) mean(diff_from_true_for_est_extreme) # Compute IQ scores (mean=100 or 85, SD=15, reliability=0.80) for 10,000 blacks and 10,000 whites. set.seed(243) W_true_score <- round(rnorm(n=10000,m=100,sd=13.42),digits=0) W_error_score <- round(rnorm(n=10000,m=0,sd=6.71),digits=1) W_observed_score <- W_true_score + W_error_score B_true_score <- round(rnorm(n=10000,m=85,sd=13.42),digits=0) B_error_score <- round(rnorm(n=10000,m=0,sd=6.71),digits=1) B_observed_score <- B_true_score + B_error_score # Regress observed scores on true scores and vice versa for blacks and whites and plot the results. regr_whites1 <- lm(W_observed_score~W_true_score) regr_blacks1 <- lm(B_observed_score~B_true_score) summary(regr_whites1) summary(regr_blacks1) axis_labels<-c(40,50,60,70,80,90,100,110,120,130,140,150,160) plot(W_true_score,W_observed_score,main="Black and white observed scores regressed on true scores\nCyan/red=whites, Gray/black=blacks\nN=20,000", xlab="True score", ylab="Observed score",pch=0, col="cyan",xlim=c(40,150),ylim=c(40,150)) points(B_true_score,B_observed_score,pch=0, col="slategrey") abline(regr_whites1,col="red") abline(regr_blacks1,col="black") axis(2,at=axis_labels, labels=axis_labels) axis(1,at=axis_labels, labels=axis_labels) regr_whites2 <- lm(W_true_score~W_observed_score) regr_blacks2 <- lm(B_true_score~B_observed_score) plot(W_observed_score,W_true_score,main="Black and white true scores regressed on observed scores\nCyan/red=whites, Gray/black=blacks\nN=20,000", xlab="Observed score", ylab="True score",pch=0, col="cyan",xlim=c(40,160),ylim=c(40,150)) points(B_observed_score,B_true_score,pch=0, col="slategrey") abline(regr_whites2,col="red") abline(regr_blacks2,col="black") axis(2,at=axis_labels, labels=axis_labels) axis(1,at=axis_labels, labels=axis_labels) summary(regr_whites2) summary(regr_blacks2) # Calculate estimated true scores for blacks and whites, regress true scores on them, and plot the results. W_true_score_est <- 100+0.8*(W_observed_score-100) B_true_score_est <- 85+0.8*(B_observed_score-85) regr_whites3 <- lm(W_true_score~W_true_score_est) regr_blacks3 <- lm(B_true_score~B_true_score_est) plot(W_true_score_est,W_true_score,main="Black and white true scores regressed on estimated true scores\nCyan/red=whites, Gray/black=blacks\nN=20,000", xlab="Estimated true score", ylab="True score",pch=0, col="cyan",xlim=c(40,160),ylim=c(40,150)) points(B_true_score_est,B_true_score,pch=0, col="slategrey") abline(regr_whites3,col="red") abline(regr_blacks3,col="black") axis(2,at=axis_labels, labels=axis_labels) axis(1,at=axis_labels, labels=axis_labels) summary(regr_whites3) summary(regr_blacks3) # Generate five z-scores with specified intercorrelations representing wage, hours worked, work experience, field of employment and occupational level. Z-score wages are then transformed to log wages for both sexes and plotted. set.seed(50) install.packages("MASS") library(MASS) corr_wage=sqrt(0.125) wage_data_men <- mvrnorm(10000, mu = c(0,0,0,0,0), Sigma = matrix(c(1,corr_wage,corr_wage,corr_wage,corr_wage,corr_wage,1,0,0,0,corr_wage,0,1,0,0,corr_wage,0,0,1,0,corr_wage,0,0,0,1), ncol = 5),empirical = TRUE) wage_data_men[,1] <- wage_data_men[,1]*0.7+6.91 wage_data_men<- cbind(rep(0, 10000),wage_data_men) wage_data_women <- mvrnorm(10000, mu = c(0,0,0,0,0), Sigma = matrix(c(1,corr_wage,corr_wage,corr_wage,corr_wage,corr_wage,1,0,0,0,corr_wage,0,1,0,0,corr_wage,0,0,1,0,corr_wage,0,0,0,1), ncol = 5),empirical = TRUE) wage_data_women[,1] <- wage_data_women[,1]*0.7+6.66 wage_data_women[,2] <- wage_data_women[,2]-0.25 wage_data_women[,3] <- wage_data_women[,3]-0.25 wage_data_women[,4] <- wage_data_women[,4]-0.25 wage_data_women[,5] <- wage_data_women[,5]-0.25 wage_data_women<- cbind(matrix(1, 10000, 1),wage_data_women) wage_data <- rbind(wage_data_men,wage_data_women) colnames(wage_data)<-c("sex","log_wage","true_hours_worked","true_work_experience","true_field_of_employment","true_occupational_level") hist(wage_data_women[,2], col=rgb(1,0,0,0.5), breaks=50, main="Histogram of log weekly wages\nWomen=red, men=blue, overlapping=purple",xlab="Log weekly wage",xlim=c(4,10)) hist(wage_data_men[,2], breaks=50,col=rgb(0,0,1,0.5), xlim=c(4,10), add=T) # Regress log wages on sex and the true score versions of the explanatory variables and print out the results using the sjPlot package. sex <- wage_data[,1] log_wage <- wage_data[,2] true_hours_worked <- wage_data[,3] true_work_experience <- wage_data[,4] true_field_of_employment <- wage_data[,5] true_occupational_level <- wage_data[,6] regr_wages_true_scores=lm(log_wage ~ true_hours_worked + true_work_experience + true_field_of_employment + true_occupational_level+sex) install.packages("sjPlot") library(sjPlot) sjt.lm(regr_wages_true_scores) # Create error-laden (reliability=0.80) versions of the explanatory variables, regress wages on them and sex, and print out the results. set.seed(46) observed_hours_worked <- true_hours_worked+rnorm(n=20000,m=0,sd=sqrt(0.25)) observed_work_experience <- true_work_experience+rnorm(n=20000,m=0,sd=sqrt(0.25)) observed_field_of_employment <- true_field_of_employment+rnorm(n=20000,m=0,sd=sqrt(0.25)) observed_occupational_level <- true_occupational_level+rnorm(n=20000,m=0,sd=sqrt(0.25)) wage_data <- cbind(wage_data,observed_hours_worked) wage_data <- cbind(wage_data,observed_work_experience) wage_data <- cbind(wage_data,observed_field_of_employment) wage_data <- cbind(wage_data,observed_occupational_level) regr_wages_observed_scores <- lm(log_wage ~ observed_hours_worked + observed_work_experience+observed_field_of_employment + observed_occupational_level+sex) sjt.lm(regr_wages_observed_scores) # Compute estimated true scores for the four explanatory variables using gender-specific mean values. Regress wages on these true score estimates, and print out the results. est_true_hours <- wage_data[,1]*-0.25+0.8*(observed_hours_worked-wage_data[,1]*-0.25) est_true_experience <- wage_data[,1]*-0.25+0.8*(observed_work_experience-wage_data[,1]*-0.25) est_true_field <- wage_data[,1]*-0.25+0.8*(observed_field_of_employment-wage_data[,1]*-0.25) est_true_occ_level <- wage_data[,1]*-0.25+0.8*(observed_occupational_level-wage_data[,1]*-0.25) regr_wages_est_true_scores <- lm(log_wage ~ est_true_hours + est_true_experience+est_true_field + est_true_occ_level+sex) sjt.lm(regr_wages_est_true_scores)