Copyright Information
/* Copyright © 2010 Adam Wolenc
* Copying and distribution of this file, with or without modification,
* are permitted in any medium without royalty provided the copyright
* notice and this notice are preserved. This file is offered as-is,
* without any warranty.
*/
Either "Adam Wolenc" or "@adamuu" or both are valid for attribution purposes. Enjoy!
Execution Steps
- adamw@chessboard:~/vote$ ./get_vote_data.sh
- Wait for all background processes to complete (use ps).
- adamw@chessboard:~/vote$ ./rearrange_all_data.pl > vote_table.txt
- adamw@chessboard:~/vote$ R --vanilla < R.in
- adamw@chessboard:~/vote$ ./get_vote_data_senate.sh
- Wait for all background processes to complete (use ps).
- adamw@chessboard:~/vote$ ./rearrange_all_data_senate.pl > senate_vote_table.txt
- adamw@chessboard:~/vote$ R --vanilla < sen_R.in
Source Code of get_vote_data.sh
#!/bin/bash
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
#This program downloads data related to roll call votes for the current
# house session. Sources of data are thomas.loc.gov and house.gov.
#Warning. This program will generate as many as 2000 http requests and
# create as many as 2000 txt files in the working directory. Also, calls
# to ./rearrange_vote_data.pl are executed in the background.
# Completion of script does not imply completion of processing.
# get_votes FILE BASE YEAR
#
# Scans FILE (a downloaded ROLL_*.asp index page) for links of the form
# "http://...rollnumber=N". For every N whose output file "YEAR.NNN.txt" does
# not exist yet, downloads "BASE/rollNNN.xml" and starts a background job that
# feeds it through ./rearrange_vote_data.pl into "YEAR.NNN.txt", then deletes
# the XML file.
#
# Arguments:
#   $1  FILE  index page to scan for roll call links
#   $2  BASE  URL prefix the rollNNN.xml files are fetched from
#   $3  YEAR  prefix of the vote code and of the output file name
# Returns:
#   1 (with a message on stderr) when an argument is missing or empty,
#   0 otherwise. Background conversion jobs may still be running on return.
get_votes() {
    if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
        echo "Function requires three parameters." >&2
        return 1
    fi

    local file=$1 base=$2 year=$3
    local link vnum xml out

    for link in $(grep -o "http://.*rollnumber=[0-9]\+" "$file"); do
        vnum=$(expr "$link" : '.*=\([0-9]\+\)')
        [ -n "$vnum" ] || continue
        # Force base 10 so a zero-padded number such as 008 is not read as octal.
        vnum=$(printf %03d "$((10#$vnum))")
        xml="roll$vnum.xml"
        out="$year.$vnum.txt"

        # Skip this vote if the output file already exists
        # (enables resume of interrupted runs).
        if [ -e "$out" ]; then
            continue
        fi

        # Get the roll call xml page; without it there is nothing to parse.
        if ! wget -q -F "$base/$xml"; then
            echo "Download failed: $base/$xml" >&2
            rm -f "$xml"
            continue
        fi

        echo "Creating $out"
        # Note: background process. The table is written under a temporary
        # name and renamed only on success, so a failed or interrupted
        # conversion never leaves a partial "$out" that the resume check
        # above would then skip.
        (
            if ./rearrange_vote_data.pl "$year.$vnum" < "$xml" > "$out.tmp"; then
                mv "$out.tmp" "$out"
            else
                rm -f "$out.tmp"
            fi
            rm -f "$xml"
        ) &
    done
    return 0
}
# Clean up from any previously interrupted runs. -f keeps rm quiet when a
# pattern matches nothing (for example on the first run in a fresh directory).
rm -f index* ROLL* roll*

# Get links to the current session from thomas.
if ! wget -q http://thomas.loc.gov/home/rollcallvotes.html; then
    echo "Could not download rollcallvotes.html" >&2
    exit 1
fi

# Follow the http links found on the House heading line and the three lines
# after it (the house.gov session index pages).
for a in $(grep -A3 '<h2>House</h2>' rollcallvotes.html | grep -o "http.[^\"]*"); do
    if ! wget -q "$a"; then
        echo "Could not download $a" >&2
        continue
    fi

    # The index page carries a parenthesised number; the four characters after
    # the opening parenthesis are used as the year.
    YEAR=$(grep -o '([0-9]\+)' index.asp)
    YEAR=${YEAR:1:4}
    BASE=${a%%index.asp}

    # Links to roll call index pages at the bottom. The URL is built by plain
    # concatenation so that BASE is never interpreted as a printf format.
    for page in $(grep -o "ROLL_[^\"]*.asp" index.asp); do
        c="$BASE$page"
        if ! wget -q "$c"; then
            echo "Could not download $c" >&2
            continue
        fi
        RNUM=$(expr "$c" : '.*_\([0-9]\+\).asp')
        echo "Getting votes from ROLL_$RNUM.asp of $YEAR"
        get_votes "ROLL_$RNUM.asp" "$BASE" "$YEAR"
        rm -f "ROLL_$RNUM.asp"
    done
    rm -f index.asp
done
rm -f rollcallvotes.html
Source Code of rearrange_vote_data.pl
#!/usr/bin/perl
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
#This program parses one roll call vote xml file from house.gov and
# produces a tab separated table of voters and votes. +1 means Yea,
# -1 means Nay, and 0 means No Vote.
use XML::Simple;
use strict;
use warnings;

# Vote code that prefixes every output row, taken from the first command-line
# argument.
my $code = shift @ARGV;
die "Usage: $0 VOTE_CODE < roll_call.xml\n" unless defined $code;

# Parse the roll call XML from standard input. ForceArray keeps
# 'recorded-vote' an array reference even when the document holds a single
# such element, which XML::Simple would otherwise fold into a plain hash.
my $ref = XMLin('-', ForceArray => ['recorded-vote']);

# Numeric encoding of a cast vote; anything not listed here counts as 0.
# NOTE(review): some roll calls may report 'Aye'/'No' rather than 'Yea'/'Nay';
# confirm against the source data whether those should map to +1/-1 as well.
my %value_of = ('Yea' => 1, 'Nay' => -1);

my $recorded_votes = $ref->{'vote-data'}->{'recorded-vote'} || [];
for my $vote (@{$recorded_votes}) {
    my $cast = $vote->{'vote'};
    my $vnum = 0;
    $vnum = $value_of{$cast}
        if defined $cast && !ref $cast && exists $value_of{$cast};

    # Absent legislator fields are printed as empty cells.
    my $legislator = $vote->{'legislator'};
    my @cells = map { defined $legislator->{$_} ? $legislator->{$_} : '' }
        qw(name-id content party state);

    print join("\t", $code, @cells, $vnum) . "\n";
}
Source Code of rearrange_all_data.pl
#!/usr/bin/perl
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
#This program processes all individual .txt files generated by
# rearrange_vote_data.pl
# compiles and transposes, producing one massive tab separated table
# with one record per voter and one column per roll call
use strict;
use warnings;

# Builds the row label for one voter.
#   $voter  - name as read from a per-vote file; may already carry a
#             parenthetical such as "(D-HI)"
#   $party  - party field of the same line
#   $state  - state field of the same line
# Returns "Name(party-state)" with any existing parenthetical removed and
# every space deleted, so the label contains no spaces.
sub _voter_label {
    my ($voter, $party, $state) = @_;
    $voter =~ s/\s*\(.*\)//;    # remove state if it's there
    my $label = "$voter ($party-$state)";    # unconditionally add party and state
    $label =~ tr/ //d;
    return $label;
}

my %vote_ids;    # vote id => 1, for every roll call seen
my %voters;      # voter id => row label
my %db;          # voter id => { vote id => vote }

foreach my $file (<2*.txt>) {
    open(my $in, '<', $file) or die "Cannot open $file: $!\n";
    while (my $line = <$in>) {
        #2010.556 A000022 Ackerman D NY 1
        chomp $line;
        my @fields = split /\t/, $line;
        # A blank or truncated line would otherwise register an empty vote id.
        next if @fields < 6;
        my ($vote_id, $voter_id, $voter, $party, $state, $vote) = @fields;

        # maintain voter id map; overwrites with most current
        # (assuming txt files are sorted)
        $voters{$voter_id} = _voter_label($voter, $party, $state);
        # maintain vote list
        $vote_ids{$vote_id} = 1;
        # maintain db
        $db{$voter_id}->{$vote_id} = $vote;
    }
    close $in;
}

my @vote_id_arr = sort keys %vote_ids;
print STDERR scalar(@vote_id_arr) . " votes.\n";

# header row: one tab-prefixed vote id per column, so it has one field fewer
# than the data rows below
print "\t$_" for @vote_id_arr;
print "\n";

# data: one row per voter, in sorted voter-id order so repeated runs produce
# the same file; roll calls without a recorded vote are written as NA
for my $voter_id (sort keys %db) {
    my $voter = $voters{$voter_id};
    my $votes = $db{$voter_id};
    print STDERR "$voter ";
    print join("\t", $voter,
        map { defined $votes->{$_} ? $votes->{$_} : 'NA' } @vote_id_arr), "\n";
}
print STDERR "\n";
Source Code of R.in
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
# Load the voter-by-roll-call table; cells reading "NA" are missing votes.
votes <- read.table("vote_table.txt", header = TRUE, na.strings = "NA")

# Remove roll calls (columns) and then congressmen (rows) with a large
# number of NA.
na_per_column <- colSums(is.na(votes))
kept <- votes[, na_per_column < 20]
na_per_row <- rowSums(is.na(kept))
kept <- kept[na_per_row < 500, ]

# Remove votes where everyone voted the same way: keep columns whose minimum
# is -1, then of those the columns whose maximum is 1.
has_min <- apply(kept, 2, FUN = min, na.rm = TRUE) == -1
kept <- kept[, has_min]
has_max <- apply(kept, 2, FUN = max, na.rm = TRUE) == 1
kept <- kept[, has_max]

dim(votes)
dim(kept)

# Hierarchical clustering of the remaining rows, cut into a fixed number of
# groups; each group is boxed on the plots with the matching border colour.
tree <- hclust(dist(kept))
ngroups <- 8
groups <- cutree(tree, k = ngroups)
box_colors <- c("blue", "blue", "green", "green", "green", "green", "red", "red")
plot_title <- "Representatives by Roll-Call Voting Pattern"

# Create huge plot.
png(file = "rep_hc.png", width = 3200, height = 2400)
plot(tree, main = plot_title, xlab = "Representative", sub = "",
     frame.plot = TRUE, cex = 0.8, cex.axis = 3, cex.main = 3, cex.lab = 3)
rect.hclust(tree, k = ngroups, border = box_colors)
dev.off()

# Create thumbnail plot.
png(file = "rep_hc_thumb.png", width = 350, height = 350)
plot(tree, main = plot_title, xlab = "Representative", sub = "",
     frame.plot = TRUE, cex = 0.5)
rect.hclust(tree, k = ngroups, border = box_colors)
dev.off()

# Export groups: one file per group listing the row names of its members.
table(groups)
for (g in seq_len(ngroups)) {
  members <- names(groups)[groups == g]
  write(members, paste("group", g, ".txt", sep = ""))
}
Source Code of get_vote_data_senate.sh
#!/bin/bash
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
#This program downloads data related to roll call votes for the current
# senate session. Sources of data are thomas.loc.gov and senate.gov.
#Warning. This program will generate as many as 2000 http requests and
# create as many as 2000 txt files in the working directory. Also, calls
# to ./rearrange_vote_data_senate.pl are executed in the background.
# Completion of script does not imply completion of processing.
# Clean up from any previously interrupted runs, including this script's own
# vote*.htm / vote_*.xml downloads. -f keeps rm quiet when a pattern matches
# nothing (for example on the first run in a fresh directory).
rm -f index* ROLL* roll* vote*.htm vote_*.xml

# Get links to the current session from thomas.
if ! wget -q http://thomas.loc.gov/home/rollcallvotes.html; then
    echo "Could not download rollcallvotes.html" >&2
    exit 1
fi

# Follow the http links found on the Senate heading line and the three lines
# after it (the senate.gov vote menu pages).
for a in $(grep -A3 '<h2>Senate</h2>' rollcallvotes.html | grep -o "http.[^\"]*"); do
    if ! wget -q "$a"; then
        echo "Could not download $a" >&2
        continue
    fi

    # Everything up to and including the last "gov" in the URL.
    BASE=$(expr "$a" : '\(.*gov\)')

    for b in $(grep -ho "/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=[0-9]\+&session=[0-9]\+&vote=[0-9]\+" vote*.htm); do
        CONGRESS=$(expr "$b" : '.*congress=\([0-9]\+\).*')
        SESSION=$(expr "$b" : '.*session=\([0-9]\+\).*')
        VNUM=$(expr "$b" : '.*vote=\([0-9]\+\).*')
        F="vote_${CONGRESS}_${SESSION}_${VNUM}.xml"
        OUT="sen.$CONGRESS.$SESSION.$VNUM.txt"

        # Without the xml there is nothing to parse; skip instead of writing
        # an empty table.
        if ! wget -q -F "$BASE/legislative/LIS/roll_call_votes/vote$CONGRESS$SESSION/$F"; then
            echo "Download failed: $F" >&2
            continue
        fi

        echo "Creating $OUT"
        # Note: background process.
        (
            ./rearrange_vote_data_senate.pl "$CONGRESS.$SESSION.$VNUM" < "$F" > "$OUT"
            rm -f "$F"
        ) &
    done
    rm -f vote*.htm
done
rm -f rollcallvotes.html
Source Code of rearrange_all_data_senate.pl
#!/usr/bin/perl
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
#This program processes all individual .txt files generated by
# rearrange_vote_data_senate.pl
# compiles and transposes, producing one massive tab separated table
# with one record per voter and one column per roll call
use strict;
use warnings;

# Builds the row label for one voter.
#   $voter  - name as read from a per-vote file; may already carry a
#             parenthetical such as "(D-HI)"
#   $party  - party field of the same line
#   $state  - state field of the same line
# Returns "Name(party-state)" with any existing parenthetical removed and
# every space deleted, so the label contains no spaces.
sub _voter_label {
    my ($voter, $party, $state) = @_;
    $voter =~ s/\s*\(.*\)//;    # remove state if it's there
    my $label = "$voter ($party-$state)";    # unconditionally add party and state
    $label =~ tr/ //d;
    return $label;
}

my %vote_ids;    # vote id => 1, for every roll call seen
my %voters;      # voter id => row label
my %db;          # voter id => { vote id => vote }

foreach my $file (<sen.*.txt>) {
    open(my $in, '<', $file) or die "Cannot open $file: $!\n";
    while (my $line = <$in>) {
        #111.1.00001 S213 Akaka (D-HI) D HI 1
        chomp $line;
        my @fields = split /\t/, $line;
        # A blank or truncated line would otherwise register an empty vote id.
        next if @fields < 6;
        my ($vote_id, $voter_id, $voter, $party, $state, $vote) = @fields;

        # maintain voter id map; overwrites with most current
        # (assuming txt files are sorted)
        $voters{$voter_id} = _voter_label($voter, $party, $state);
        # maintain vote list
        $vote_ids{$vote_id} = 1;
        # maintain db
        $db{$voter_id}->{$vote_id} = $vote;
    }
    close $in;
}

my @vote_id_arr = sort keys %vote_ids;
print STDERR scalar(@vote_id_arr) . " votes.\n";

# header row: one tab-prefixed vote id per column, so it has one field fewer
# than the data rows below
print "\t$_" for @vote_id_arr;
print "\n";

# data: one row per voter, in sorted voter-id order so repeated runs produce
# the same file; roll calls without a recorded vote are written as NA
for my $voter_id (sort keys %db) {
    my $voter = $voters{$voter_id};
    my $votes = $db{$voter_id};
    print STDERR "$voter ";
    print join("\t", $voter,
        map { defined $votes->{$_} ? $votes->{$_} : 'NA' } @vote_id_arr), "\n";
}
print STDERR "\n";
Source Code of sen_R.in
#Copyright © 2010 Adam Wolenc
#Copying and distribution of this file, with or without modification,
#are permitted in any medium without royalty provided the copyright
#notice and this notice are preserved.
# Load the voter-by-roll-call table; cells reading "NA" are missing votes.
votes <- read.table("senate_vote_table.txt", header = TRUE, na.strings = "NA")

# Remove roll calls (columns) and then senators (rows) with a large number
# of NA.
na_per_column <- colSums(is.na(votes))
kept <- votes[, na_per_column < 30]
na_per_row <- rowSums(is.na(kept))
kept <- kept[na_per_row < 300, ]

# Remove votes where everyone voted the same way: keep columns whose minimum
# is -1, then of those the columns whose maximum is 1.
has_min <- apply(kept, 2, FUN = min, na.rm = TRUE) == -1
kept <- kept[, has_min]
has_max <- apply(kept, 2, FUN = max, na.rm = TRUE) == 1
kept <- kept[, has_max]

dim(votes)
dim(kept)

# Hierarchical clustering of the remaining rows, cut into a fixed number of
# groups; each group is boxed on the plots with the matching border colour.
tree <- hclust(dist(kept))
ngroups <- 8
groups <- cutree(tree, k = ngroups)
box_colors <- c("blue", "blue", "blue", "red", "red", "green", "green", "green")
plot_title <- "Senators by Roll-Call Voting Pattern"

# Create huge plot.
png(file = "sen_hc.png", width = 1600, height = 1200)
plot(tree, main = plot_title, xlab = "Senator", sub = "",
     frame.plot = TRUE, cex = 1.0, cex.axis = 3, cex.main = 3, cex.lab = 3)
rect.hclust(tree, k = ngroups, border = box_colors)
dev.off()

# Create thumbnail plot.
png(file = "sen_hc_thumb.png", width = 350, height = 350)
plot(tree, main = plot_title, xlab = "Senator", sub = "",
     frame.plot = TRUE, cex = 0.5)
rect.hclust(tree, k = ngroups, border = box_colors)
dev.off()

# Export groups: one file per group listing the row names of its members.
table(groups)
for (g in seq_len(ngroups)) {
  members <- names(groups)[groups == g]
  write(members, paste("sen_group", g, ".txt", sep = ""))
}
Site created by Adam Wolenc