Copyright Information

/* Copyright © 2010 Adam Wolenc
 *    Copying and distribution of this file, with or without modification,
 *    are permitted in any medium without royalty provided the copyright
 *    notice and this notice are preserved.  This file is offered as-is,
 *    without any warranty.
 */
Either "Adam Wolenc" or "@adamuu" or both are valid for attribution purposes. Enjoy!

Execution Steps

  1. adamw@chessboard:~/vote$ ./get_vote_data.sh
  2. Wait for all background processes to complete (use ps).
  3. adamw@chessboard:~/vote$ ./rearrange_all_data.pl > vote_table.txt
  4. adamw@chessboard:~/vote$ R --vanilla < R.in
  5. adamw@chessboard:~/vote$ ./get_vote_data_senate.sh
  6. Wait for all background processes to complete (use ps).
  7. adamw@chessboard:~/vote$ ./rearrange_all_data_senate.pl > senate_vote_table.txt
  8. adamw@chessboard:~/vote$ R --vanilla < sen_R.in

Source Code of get_vote_data.sh


#!/bin/bash

#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

#This program downloads data related to roll call votes for the current
# house session. Sources of data are thomas.loc.gov and house.gov.

#Warning. This program will generate as many as 2000 http requests and
# create as many as 2000 txt files in the working directory. Also, calls
# to ./rearrange_vote_data.pl are executed in the background.
# Completion of script does not imply completion of processing.

# get_votes FILE BASE YEAR
#   FILE - a downloaded roll call index page to scan for "rollnumber=N" links
#   BASE - URL prefix from which roll<NNN>.xml files are fetched
#   YEAR - prefix used for the YEAR.NNN.txt output files
# For every roll call link found in FILE, downloads roll<NNN>.xml and starts
# a background job that feeds it to ./rearrange_vote_data.pl, writing
# YEAR.NNN.txt and then deleting the xml file. Roll calls whose output file
# already exists are skipped, which lets an interrupted run be resumed.
# Always returns 0, including when arguments are missing.
get_votes() {
    if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
        echo "Function requires three parameters."
        return 0
    fi

    local FILE=$1
    local BASE=$2
    local YEAR=$3
    local b VNUM XML OUT

    for b in $(grep -o "http://.*rollnumber=[0-9]\+" "$FILE"); do
        # the grep pattern guarantees the link ends in "=<digits>"
        VNUM=${b##*=}
        # force base 10 so a zero-padded number is not read as octal
        VNUM=$(printf %03d "$((10#$VNUM))")
        XML="roll$VNUM.xml"
        OUT="$YEAR.$VNUM.txt"

        #skip this process if the output file already exists (enables resume of interrupted runs)
        if [ -e "$OUT" ]; then
            continue
        fi

        #get roll call xml page and parse into tab table
        if ! wget -q -F "$BASE/$XML"; then
            # do not create an empty output file: it would make later runs
            # skip this roll call
            echo "Download failed for $BASE/$XML" >&2
            continue
        fi

        echo "Creating $OUT"
        #note: background process.
        ( ./rearrange_vote_data.pl "$YEAR.$VNUM" < "$XML" > "$OUT" ; rm "$XML" ) &
    done
}

# Main driver: finds the House roll call index pages linked from thomas,
# walks every ROLL_nnn.asp index page they reference, and hands each page to
# get_votes. Temporary html/asp files are removed as soon as they are used.

#cleanup from any previously interrupted runs
# (-f: a clean working directory is not an error)
rm -f index* ROLL* roll*

#get links to current session from thomas
wget -q http://thomas.loc.gov/home/rollcallvotes.html

#get first two links to house.gov
for a in $(grep -A3 '<h2>House</h2>' rollcallvotes.html | grep -o "http.[^\"]*"); do
    wget -q "$a"

    # the year is taken from the first parenthesised number in index.asp,
    # e.g. "(2010)" -> "2010"
    YEAR=$(grep -o '([0-9]\+)' index.asp)
    YEAR=${YEAR:1:4}
    BASE=${a%%index.asp}

    #links to roll call index pages at the bottom
    for c in $(grep -o "ROLL_[^\"]*.asp" index.asp | xargs -n1 printf "$BASE%s\n"); do
        wget -q "$c"
        RNUM=$(expr "$c" : '.*_\([0-9]\+\).asp')
        echo "Getting votes from ROLL_$RNUM.asp of $YEAR"
        get_votes "ROLL_$RNUM.asp" "$BASE" "$YEAR"
        rm "ROLL_$RNUM.asp"
    done

    rm index.asp
done

rm rollcallvotes.html

Source Code of rearrange_vote_data.pl


#!/usr/bin/perl

#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

#This program parses one roll call vote xml file from house.gov and
# produces a tab separated table of voters and votes. +1 means Yea,
# -1 means Nay, and 0 means No Vote.

use strict;
use warnings;
use XML::Simple;

# Usage: rearrange_vote_data.pl CODE < roll.xml
# Reads one roll call xml document from standard input and prints one
# tab separated line per recorded vote:
#   CODE, legislator name-id, legislator name, party, state, numeric vote

# Numeric value for each recognised vote string; anything else becomes 0.
# NOTE(review): only 'Yea' and 'Nay' are mapped. If the source data uses
# other spellings for the same choices they are counted as 0 -- confirm
# against real roll call files.
my %value_of = (
    'Yea' => 1,
    'Nay' => -1,
);

# Label copied into the first column of every output row.
my $code = shift @ARGV;
$code = '' unless defined $code;

# ForceArray keeps 'recorded-vote' an array even when the document holds a
# single vote, so the loop below never dereferences a hash as an array.
my $ref = XMLin('-', ForceArray => ['recorded-vote']);

for my $vote (@{ $ref->{'vote-data'}->{'recorded-vote'} }) {
    my $cast = defined $vote->{'vote'} ? $vote->{'vote'} : '';
    my $vnum = exists $value_of{$cast} ? $value_of{$cast} : 0;

    my $legislator = $vote->{'legislator'};
    # missing attributes are printed as empty fields rather than warnings
    my @fields = map { defined $_ ? $_ : '' } (
        $legislator->{'name-id'},
        $legislator->{'content'},
        $legislator->{'party'},
        $legislator->{'state'},
    );

    print join("\t", $code, @fields, $vnum) . "\n";
}

Source Code of rearrange_all_data.pl


#!/usr/bin/perl

#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

#This program processes all individual .txt files generated by
# rearrange_vote_data.pl
# compiles and transposes, producing one massive tab separated table
# with one record per voter and one column per roll call

use strict;
use warnings;

# Reads every 2*.txt file in the working directory (one file per roll call,
# six tab separated fields per line) and prints a single table to standard
# output: a header row of roll call ids, then one row per voter holding that
# voter's value for each roll call, or NA where the voter has no record.
# Progress information goes to standard error.

my %vote_ids;    # set of all roll call ids seen
my %voters;      # voter id -> display label
my %db;          # voter id -> { roll call id -> vote }

for my $file (glob '2*.txt') {
    open(my $in, '<', $file) or die "Cannot open $file: $!\n";
    while (my $line = <$in>) {
        #2010.556 A000022 Ackerman D NY 1
        chomp $line;
        my @fields = split /\t/, $line;
        next if @fields < 6;    # skip blank or truncated lines
        my ($vote_id, $voter_id, $voter, $party, $state, $vote) = @fields[0 .. 5];

        # remove a parenthesised state such as " (GA)" if it's there
        $voter =~ s/\s*\(.*\)//;
        #unconditionally add party and state
        $voter = "$voter ($party-$state)";
        # the label is one whitespace-delimited field in the output table,
        # so it must not contain spaces
        $voter =~ tr/ //d;

        #maintain voter id map
        # overwrites with most current (assuming txt files are sorted)
        $voters{$voter_id} = $voter;

        #maintain vote list
        $vote_ids{$vote_id} = 1;

        #maintain db
        $db{$voter_id}{$vote_id} = $vote;
    }
    close $in;
}

my @vote_id_arr = sort keys %vote_ids;
print STDERR scalar(@vote_id_arr), " votes.\n";

#header row (leading tab leaves the voter column unnamed)
print join('', map { "\t$_" } @vote_id_arr), "\n";

#data
# voter ids are sorted so that repeated runs produce identical output
for my $voter_id (sort keys %db) {
    my $voter = $voters{$voter_id};
    print STDERR "$voter ";

    my @cells = map {
        defined $db{$voter_id}{$_} ? $db{$voter_id}{$_} : 'NA'
    } @vote_id_arr;

    print join("\t", $voter, @cells), "\n";
}
print STDERR "\n";

Source Code of R.in


#Copyright © 2010 Adam Wolenc     
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

# Clusters representatives by their roll call votes.
# Input:  vote_table.txt (one row per voter, one column per roll call)
# Output: rep_hc.png, rep_hc_thumb.png and group1.txt .. group8.txt

data <- read.table('vote_table.txt', na.strings="NA", header=TRUE)

#remove votes and congressmen with large number of NA
datat <- data
datat <- datat[, colSums(is.na(datat)) < 20 ]
datat <- datat[ rowSums(is.na(datat)) < 500, ]

#remove votes where everyone voted the same way
# (a kept column must contain at least one -1 and at least one +1)
datat <- datat[ , apply(datat, 2, FUN=min, na.rm=TRUE) == -1]
datat <- datat[ , apply(datat, 2, FUN=max, na.rm=TRUE) == 1]
dim(data)
dim(datat)

#make hc plot
hc <- hclust(dist(datat))

#cut the tree once; both plots and the export below use the same groups
ngroups <- 8
groups <- cutree(hc, k=ngroups)
border_colors <- c("blue", "blue", "green", "green", "green", "green", "red", "red")

#create huge plot
png(file="rep_hc.png", width = 3200, height = 2400)
plot(hc, main="Representatives by Roll-Call Voting Pattern", xlab="Representative", sub="", frame.plot=TRUE, cex=0.8, cex.axis=3, cex.main=3, cex.lab=3)
rect.hclust(hc, k=ngroups, border=border_colors)
dev.off()

#create thumbnail plot
png(file="rep_hc_thumb.png", width = 350, height = 350)
plot(hc, main="Representatives by Roll-Call Voting Pattern", xlab="Representative", sub="", frame.plot=TRUE, cex=0.5)
rect.hclust(hc, k=ngroups, border=border_colors)
dev.off()

#export groups
# one file per group, listing the row names of its members
table(groups)
for (i in 1:ngroups) {
    write(names(groups[groups == i]), paste("group", i, ".txt", sep=""))
}

Source Code of get_vote_data_senate.sh


#!/bin/bash

#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

#This program downloads data related to roll call votes for the current
# senate session. Sources of data are thomas.loc.gov and senate.gov.

#Warning. This program will generate as many as 2000 http requests and
# create as many as 2000 txt files in the working directory. Also, calls
# to ./rearrange_vote_data_senate.pl are executed in the background.
# Completion of script does not imply completion of processing.

# Main driver: finds the Senate roll call index pages linked from thomas,
# downloads the xml file behind every roll call link on them, and converts
# each one to sen.<congress>.<session>.<vote>.txt in a background job.

#cleanup from any previously interrupted runs
# (includes this script's own temporaries, so stale vote*.htm pages from an
#  interrupted run are not parsed again)
rm -f index* ROLL* roll* vote*.htm vote_*.xml

#get links to current session from thomas
wget -q http://thomas.loc.gov/home/rollcallvotes.html

#get first two links to senate.gov
for a in $(grep -A3 '<h2>Senate</h2>' rollcallvotes.html | grep -o "http.[^\"]*"); do
    wget -q "$a"
    # everything up to and including the last "gov" in the link
    BASE=$(expr "$a" : '\(.*gov\)')

    for b in $(grep -ho "/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=[0-9]\+&session=[0-9]\+&vote=[0-9]\+" vote*.htm); do
        CONGRESS=$(expr "$b" : '.*congress=\([0-9]\+\).*')
        SESSION=$(expr "$b" : '.*session=\([0-9]\+\).*')
        VNUM=$(expr "$b" : '.*vote=\([0-9]\+\).*')
        F="vote_${CONGRESS}_${SESSION}_${VNUM}.xml"
        OUT="sen.$CONGRESS.$SESSION.$VNUM.txt"

        if ! wget -q -F "$BASE/legislative/LIS/roll_call_votes/vote$CONGRESS$SESSION/$F"; then
            # nothing to process; avoid leaving an empty output file behind
            echo "Download failed for $F" >&2
            continue
        fi

        echo "Creating $OUT"
        #note: background process.
        ( ./rearrange_vote_data_senate.pl "$CONGRESS.$SESSION.$VNUM" < "$F" > "$OUT" ; rm "$F" ) &
    done

    rm vote*.htm
done

rm rollcallvotes.html

Source Code of rearrange_all_data_senate.pl


#!/usr/bin/perl

#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

#This program processes all individual .txt files generated by
# rearrange_vote_data_senate.pl
# compiles and transposes, producing one massive tab separated table
# with one record per voter and one column per roll call

use strict;
use warnings;

# Reads every sen.*.txt file in the working directory (one file per roll
# call, six tab separated fields per line) and prints a single table to
# standard output: a header row of roll call ids, then one row per voter
# holding that voter's value for each roll call, or NA where the voter has
# no record. Progress information goes to standard error.

my %vote_ids;    # set of all roll call ids seen
my %voters;      # voter id -> display label
my %db;          # voter id -> { roll call id -> vote }

for my $file (glob 'sen.*.txt') {
    open(my $in, '<', $file) or die "Cannot open $file: $!\n";
    while (my $line = <$in>) {
        #111.1.00001 S213 Akaka (D-HI) D HI 1
        chomp $line;
        my @fields = split /\t/, $line;
        next if @fields < 6;    # skip blank or truncated lines
        my ($vote_id, $voter_id, $voter, $party, $state, $vote) = @fields[0 .. 5];

        # remove a parenthesised suffix such as " (D-HI)" if it's there
        $voter =~ s/\s*\(.*\)//;
        #unconditionally add party and state
        $voter = "$voter ($party-$state)";
        # the label is one whitespace-delimited field in the output table,
        # so it must not contain spaces
        $voter =~ tr/ //d;

        #maintain voter id map
        # overwrites with most current (assuming txt files are sorted)
        $voters{$voter_id} = $voter;

        #maintain vote list
        $vote_ids{$vote_id} = 1;

        #maintain db
        $db{$voter_id}{$vote_id} = $vote;
    }
    close $in;
}

my @vote_id_arr = sort keys %vote_ids;
print STDERR scalar(@vote_id_arr), " votes.\n";

#header row (leading tab leaves the voter column unnamed)
print join('', map { "\t$_" } @vote_id_arr), "\n";

#data
# voter ids are sorted so that repeated runs produce identical output
for my $voter_id (sort keys %db) {
    my $voter = $voters{$voter_id};
    print STDERR "$voter ";

    my @cells = map {
        defined $db{$voter_id}{$_} ? $db{$voter_id}{$_} : 'NA'
    } @vote_id_arr;

    print join("\t", $voter, @cells), "\n";
}
print STDERR "\n";

Source Code of sen_R.in


#Copyright © 2010 Adam Wolenc     
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.

# Clusters senators by their roll call votes.
# Input:  senate_vote_table.txt (one row per voter, one column per roll call)
# Output: sen_hc.png, sen_hc_thumb.png and sen_group1.txt .. sen_group8.txt

data <- read.table('senate_vote_table.txt', na.strings="NA", header=TRUE)

#remove votes and senators with large number of NA
datat <- data
datat <- datat[, colSums(is.na(datat)) < 30 ]
datat <- datat[ rowSums(is.na(datat)) < 300, ]

#remove votes where everyone voted the same way
# (a kept column must contain at least one -1 and at least one +1)
datat <- datat[ , apply(datat, 2, FUN=min, na.rm=TRUE) == -1]
datat <- datat[ , apply(datat, 2, FUN=max, na.rm=TRUE) == 1]
dim(data)
dim(datat)

#make hc plot
hc <- hclust(dist(datat))

#cut the tree once; both plots and the export below use the same groups
ngroups <- 8
groups <- cutree(hc, k=ngroups)
border_colors <- c("blue", "blue", "blue", "red", "red", "green", "green", "green")

#create huge plot
png(file="sen_hc.png", width = 1600, height = 1200)
plot(hc, main="Senators by Roll-Call Voting Pattern", xlab="Senator", sub="", frame.plot=TRUE, cex=1.0, cex.axis=3, cex.main=3, cex.lab=3)
rect.hclust(hc, k=ngroups, border=border_colors)
dev.off()

#create thumbnail plot
png(file="sen_hc_thumb.png", width = 350, height = 350)
plot(hc, main="Senators by Roll-Call Voting Pattern", xlab="Senator", sub="", frame.plot=TRUE, cex=0.5)
rect.hclust(hc, k=ngroups, border=border_colors)
dev.off()

#export groups
# one file per group, listing the row names of its members
table(groups)
for (i in 1:ngroups) {
    write(names(groups[groups == i]), paste("sen_group", i, ".txt", sep=""))
}

Site created by Adam Wolenc