Example: Tim Churches

The following code is modeled after an illustration designed for RPy done by Tim Churches (RPy website).  The code analyzes data on the Old Faithful geyser in Yellowstone National Park.  The code illustrates the ease and power of RinRuby and its methods.  The RinRuby code is shown on the left and the original RPy code on the right.

 
require "rinruby"

faithful_data = {"eruption_duration"=>[], "waiting_time"=>[]}


for row in File.readlines('faithful.dat')[1..-1]
  splitrow = row.chomp.split
  faithful_data["eruption_duration"] << splitrow[0].to_f
  faithful_data["waiting_time"] << splitrow[1].to_i
end


R.ed = faithful_data["eruption_duration"]
R.eval "edsummary <- summary(ed)"
edsummary = R.pull("as.vector(edsummary)")
keys = R.pull("names(edsummary)")
puts "Summary of Old Faithful eruption duration data"
keys.each_index do |i|
  puts "#{keys[i]}: #{sprintf('%.3f',edsummary[i])}"
end
puts
puts "Stem-and-leaf plot of Old Faithful eruption duration data"
R.eval "stem(ed)"

R.eval <<EOF
  png("faithful_histogram.png",width=10,height=7.5)
  hist(ed,seq(1.6, 5.2, 0.2), prob=1,col="lightgreen",
      main="Old Faithful eruptions",xlab="Eruption duration (seconds)")
  lines(density(ed,bw=0.1),col="orange")
  rug(ed)
  dev.off()
EOF

cutoff = 3
R.long_ed = R.ed.delete_if{ |x| x <= cutoff }
R.eval <<EOF
 png('faithful_ecdf.png',width=10,height=7.5)
 # library(stepfun)           # package has been merged into 'stats'
 plot(ecdf(long_ed), do.points=0, verticals=1,
      main=paste('Empirical cumulative distribution function of Old',
                 ' Faithful eruptions longer than #{cutoff} seconds'))
 x <- seq(3,5.4,0.01)
 lines(seq(3,5.4,0.01),pnorm(seq(3,5.4,0.01),mean=mean(long_ed),
       sd=sqrt(var(long_ed))), lty=3, lwd=2, col='red')
 dev.off()

 png('faithful_qq.png',width=10,height=7.5)
 par(pty="s")
 qqnorm(long_ed,col="blue")
 qqline(long_ed,col="red")
 dev.off()
EOF

# R.eval "library('ctest')"   # package has been merged into 'stats'
puts
puts "Shapiro-Wilks normality test of Old Faithful eruptions" +
     " longer than #{cutoff} seconds"
R.eval "sw <- shapiro.test(long_ed)"
puts "W = #{sprintf("%.4f",R.pull("sw$statistic"))}"
puts "p-value = #{sprintf("%.5f",R.pull("sw$p.value"))}"

puts
puts "One-sample Kolmogorov-Smirnov test of Old Faithful eruptions" +
     " longer than #{cutoff} seconds"
R.eval "ks <- ks.test(long_ed,'pnorm',mean=mean(long_ed),"+
                     "sd=sqrt(var(long_ed)))"
puts "D = #{sprintf("%.4f",R.pull("ks$statistic"))}"
puts "p-value = #{sprintf("%.4f",R.pull("ks$p.value"))}"
puts "Alternative hypothesis: #{R.pull("ks$alternative")}"
puts
 
from rpy import *

faithful_data = {"eruption_duration":[],
                 "waiting_time":[]}

f = open('faithful.dat','r')

for row in f.readlines()[1:]: # skip the column header line
    splitrow = row[:-1].split(" ")
    faithful_data["eruption_duration"].append(float(splitrow[0]))
    faithful_data["waiting_time"].append(int(splitrow[1]))

f.close()

ed = faithful_data["eruption_duration"]
edsummary = r.summary(ed)
print "Summary of Old Faithful eruption duration data"
for k in edsummary.keys():
    print k + ": %.3f" % edsummary[k]
print   
print "Stem-and-leaf plot of Old Faithful eruption duration data"
print r.stem(ed)


r.png('faithful_histogram.png',width=733,height=550)
r.hist(ed,r.seq(1.6, 5.2, 0.2), prob=1,col="lightgreen",
       main="Old Faithful eruptions",xlab="Eruption duration (seconds)")
r.lines(r.density(ed,bw=0.1),col="orange")
r.rug(ed)
r.dev_off()



long_ed = filter(lambda x: x > 3, ed)
r.png('faithful_ecdf.png',width=733,height=550)
r.library('stepfun')
r.plot(r.ecdf(long_ed), do_points=0, verticals=1, col="blue",
       main=paste("Empirical cumulative distribution function",
                  " of Old Faithful eruptions longer than 3 seconds")
x = r.seq(3,5.4,0.01)
r.lines(r.seq(3,5.4,0.01),r.pnorm(r.seq(3,5.4,0.01),mean=r.mean(long_ed),
        sd=r.sqrt(r.var(long_ed))), lty=3, lwd=2, col="red")
r.dev_off()



r.png('faithful_qq.png',width=733,height=550)
r.par(pty="s")
r.qqnorm(long_ed,col="blue")
r.qqline(long_ed,col="red")
r.dev_off()


r.library('ctest')
print
print("Shapiro-Wilks normality test of Old Faithful eruptions" +\
      " longer than 3 seconds")
sw = r.shapiro_test(long_ed)
print "W = %.4f" % sw['statistic']['W']
print "p-value = %.5f" % sw['p.value']

print
print("One-sample Kolmogorov-Smirnov test of Old Faithful eruptions" +\
      " longer than 3 seconds"
ks = r.ks_test(long_ed,"pnorm", mean=r.mean(long_ed),
               sd=r.sqrt(r.var(long_ed)))
print "D = %.4f" % ks['statistic']['D']
print "p-value = %.4f" % ks['p.value']
print "Alternative hypothesis: %s" % ks['alternative']
print


Output:


The output from both scripts is the same, except that RinRuby maintains the order of the summary statistics while RPy changes the order.  The output shown below is from RinRuby.

Summary of Old Faithful eruption duration data
Min.: 1.600
1st Qu.: 2.163
Median: 4.000
Mean: 3.488
3rd Qu.: 4.454
Max.: 5.100

Stem-and-leaf plot of Old Faithful eruption duration data

  The decimal point is 1 digit(s) to the left of the |

  16 | 070355555588
  18 | 000022233333335577777777888822335777888
  20 | 00002223378800035778
  22 | 0002335578023578
  24 | 00228
  26 | 23
  28 | 080
  30 | 7
  32 | 2337
  34 | 250077
  36 | 0000823577
  38 | 2333335582225577
  40 | 0000003357788888002233555577778
  42 | 03335555778800233333555577778
  44 | 02222335557780000000023333357778888
  46 | 0000233357700000023578
  48 | 00000022335800333
  50 | 0370

Shapiro-Wilks normality test of Old Faithful eruptions longer than 3 seconds
W = 0.9793
p-value = 0.01052

One-sample Kolmogorov-Smirnov test of Old Faithful eruptions longer than 3 seconds
D = 0.0661
p-value = 0.4284
Alternative hypothesis: two-sided