|
Welcome,
Guest
|
First R script
(1 viewing) (1) Guest
|
TOPIC: First R script
First R script 11 years, 7 months ago #407
|
#Based on Andreadis, I. (2014) Data Quality and Data Cleaning in
#Garzia, D. Marschall, S. (eds) Matching Voters with Parties and Candidates.
#Voting Advice Applications in Comparative Perspective,
#ECPR Press, ISBN: 9781907301735
#http://www.polres.gr/en/sites/default/files/VAA-Book-Ch6.pdf
#Download the files from http://doi.org/10.3886/E17816V3 and
#save them to your working directory, e.g. C:/WSPTS before you start
setwd("C:/WSPTS")
#nofs is the number of statements
nofs<-31
spssfile<-"hmv201410percent-17956.sav"
lengthfile<-"lengthws-18290.csv"
#We need to read SPSS files
library(foreign)
#Store SPSS file in votematch, ignore some warnings
votematch<-read.spss(spssfile, to.data.frame=TRUE, reencode="UTF-8")
dim(votematch)
names(votematch)
#keep track of the last column
lastc<-dim(votematch)[2]
#Let's review some of the timestamps
votematch$t2[1] #this is the timestamp of the first user answering the second question
format(votematch$t1[1], scientific=FALSE)
head(format(cbind(votematch$t1,votematch$t2),scientific=FALSE))
#timestamp is the number of milliseconds since midnight Jan 1, 1970
ms<-votematch$t1[1]
s<-ms/1000
m<-s/60
h<-m/60
d<-h/24
y<-d/365 #when was this response recorded?
#keep only if votematch$t[i]>0
for (i in 0:nofs) {
condition<-eval(parse(text =paste("votematch$t", i, ">0", sep="")))
votematch<-votematch[condition,]
}
#keep only if votematch$t[i]<votematch$t[i+1]
for (i in 0:(nofs-1)) {
condition<-eval(parse(text =paste("votematch$t", i, "<votematch$t", i+1, sep="")))
votematch<-votematch[condition,]
}
dim(votematch)
#compute item response times, i.e. timestamp differences in seconds:
#votematch$d[i+1]<-(votematch$t[i+1]-votematch$t[i])/1000"
for (i in 0:(nofs-1)) {
eval(parse(text=paste("votematch$d", i+1, "<-(votematch$t", i+1,
"-votematch$t", i, ")/1000", sep="")))
}
summary(votematch$d1) #observe the maximum value!
difcols<-c((lastc+1):(lastc+nofs))
#calculate the median response time for each item
median_times<-sapply(votematch[,(lastc+1):(lastc+nofs)],median)
median_times
#load the file with the length of each statement (mumber of characters without spaces)
nofchars<-read.csv(lengthfile)
chars.time<-data.frame(nofchars, median_times)
plot(chars.time)
#put labels to identify the outliers
text(chars.time$Length, chars.time$median_times,
as.vector(row.names(chars.time)), pos=1,cex=0.8, col="red")
#Linear regression model
fit<-lm(chars.time$median_times~chars.time$Length)
summary(fit)
#Calculate the thresholds
threshold1<-1.4+chars.time$Length/39.375
threshold2<-1.4+chars.time$Length/28.125
threshold1
threshold2
#We need car to use recode
library(car)
#categories of cases
#cats{i}<-recode(votematch$d{i}, "lo:threshold1[i]='Scanning';
#threshold1[i]:threshold2[i]='Skimming';
#threshold2[i]:hi='Rauding'", as.factor.result=TRUE)
for (i in 1:31) {
eval(parse(text=paste("votematch$cats", i, "<-recode(votematch$d", i, ", \"lo:",
threshold1[i], "=1;", threshold1[i], ":" , threshold2[i],
"=2;", threshold2[i], ":hi=3\", as.factor.result=FALSE)",
sep="")))
}
#Frequency of scannning, skimming and rauding for one of the statements (e.g 22)
table(votematch$cats22)
catcols<-c((lastc+nofs+1):(lastc+2*nofs))
#get the categories (i.e. 1=scanning, 2=skimming, 3=rauding)
cats<-votematch[,catcols]
#is the respondent scanning? T/F
fast<-cats<2
#How many fast responses?
sumfast<-rowSums(fast)
#You can be flexible on the rule you use to drop cases,
#e.g. you can use as in the chapter: "at least half are scanning or skimming"
#or you can have a more strict rule that will cut less cases: "at least half are scanning"
#or keep only if the respondent was scanning <11 times (less than one third)
length(sumfast[sumfast>10])/length(sumfast) #percentage of dropped cases
cleandata<-votematch[sumfast<11,]
dim(cleandata)
# keep cases where other variables have valid values
table(cleandata$sex)
cleandata<-cleandata[cleandata$sex!="Missing",]
table(cleandata$age)
cleandata<-cleandata[cleandata$age!="Missing",]
table(cleandata$edu)
cleandata<-cleandata[cleandata$edu!="Missing",]
table(cleandata$interest)
cleandata<-cleandata[cleandata$interest!="Missing",]
table(cleandata$device_type_n)
cleandata<-cleandata[cleandata$device_type_n!="Mobile Device",]
cleandata<-cleandata[cleandata$device_type_n!="unknown",]
prop.table(table(cleandata$device_type_n))
# The distribution of response times per item is strongly skewed to the right
summary(cleandata$d2)
hist(cleandata$d2)
boxplot(cleandata$d2)
hist(log(cleandata$d2))
boxplot(log(cleandata$d2))
# The distribution of response times per user?
#Let's observe the first user
i<-1
summary(as.numeric(cleandata[i,difcols]))
hist(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))
#Let's observe the 60th user
i<-60
summary(as.numeric(cleandata[i,difcols]))
hist(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))
1229/60
#Exploratory data analysis
#let's see the first 100
out1<-rep(0,100)
for (i in 1:100) {
outlier<-length(boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)$out)
if(outlier>0) out1[i]<-1
}
# this would cut to many cases;
sum(out1)
which(out1==1)
#Let's observe the 10th user
i<-10
summary(as.numeric(cleandata[i,difcols]))
hist(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))
summary(log(as.numeric(cleandata[i,difcols])))
hist(log(as.numeric(cleandata[i,difcols]))
boxplot.stats(log(as.numeric(cleandata[i,difcols])))
boxplot.stats(as.numeric(cleandata[i,difcols]), coef=3)
boxplot(as.numeric(cleandata[i,difcols]))
#Exploratory data analysis after using the log function
ex<-c(10,100,1000)
log(ex)
#this would cut less cases; let's see the first 100
out1<-rep(0,100)
for (i in 1:100) {
outlier<-length(boxplot.stats(log(as.numeric(cleandata[i,difcols])), coef=3)$out)
if(outlier>0) out1[i]<-1
}
sum(out1)
which(out1==1)
dim(cleandata)[1]
# Define all extreme values as missing
for (i in 1:dim(cleandata)[1]) {
outlier<-length(boxplot.stats(log(as.numeric(cleandata[i,difcols])), coef=3)$out)
if(outlier>0) {
colsi<-which(log(cleandata[i,difcols])>=
min(boxplot.stats(log(as.numeric(cleandata[i,difcols])), coef=3)$out))
nacols<-difcols[1]-1+colsi
cleandata[i, nacols]<-NA
}
}
cleandata[1:11,difcols]
model<-lm(d2~sex+edu+age+device_type_n, data=cleandata)
summary(model) |
|
Last Edit: 11 years, 7 months ago by Ioannis Andreadis.
|
Re: First R script 11 years, 7 months ago #408
|
# Here's some R Code w.r.t the remarks
# on indexing and vectorization that I made after the last lab session # also benchmarking shows avoiding loops in favor of indexing can # speed up things quite a bit. # also use the scipen options if you want to avoid exponential # notation of large numbers options(scipen = 4) library(foreign) votematch<-read.spss("ioannis.sav", to.data.frame=TRUE, reencode="UTF-8") dim(votematch) names(votematch) str(votematch) votematch$t1[1] votematch2 <- votematch library(microbenchmark) microbenchmark({relevant_colums <- grep("^t",names(votematch2),value=T) relevant_df <- votematch2[,relevant_colums] relevant_df[relevant_df <= 0] <- NA # replace the relevant part of votematch2 with the relevant # that contains NAs votematch2[,relevant_colums] <- relevant_df # run the the na.omit over the entire data.frame relevant_df_clean <- na.omit(votematch2) }) |
|
|
|
Time to create page: 0.23 seconds


