schoolrankings/schoolrankings.R
François Pelletier fb1ff96118 première version
2015-11-07 15:54:16 -05:00

95 lines
3.2 KiB
R

#install.packages("rvest")
library(rvest)
library(reshape2)
library(XML)
library(parallel)
# Fetch the province-wide secondary-school ranking page once; both the
# ranking table and the per-school links below are read from this document.
quebec_html <- html(
  "http://quebec.compareschoolrankings.org/secondary/SchoolsByRankLocationName.aspx"
)

# Column labels applied to the first seven columns of the ranking table.
noms_colonnes <- c(
  "Rang 2013-14",
  "Rang des 5 dernières années",
  "Tendance",
  "School Name",
  "City",
  "Cote globale 2013-14",
  "Cote globale moyenne des 5 dernières années"
)

# Parse the table with class "rating", then drop its first parsed row and
# keep only columns 1 to 7 before relabelling them.
quebec_raw <- html_table(
  html_node(quebec_html, css = "table.rating"),
  fill = TRUE,
  header = TRUE
)
quebec_raw <- quebec_raw[-1, 1:7]
colnames(quebec_raw) <- noms_colonnes

# Each anchor inside a ".tdcell" element carries a site-relative href;
# prefix it with the site root to obtain one absolute URL per table row.
quebec_raw$url <- paste0(
  "http://quebec.compareschoolrankings.org",
  html_attr(html_nodes(quebec_html, css = ".tdcell a"), "href")
)
# School level query
#' Convert the textual rating cells into numbers.
#'
#' Extracts the first "digits,digits" token (decimal comma) from each cell.
#' Cells without such a token, and NA cells, yield NA.
#'
#' @param raw Vector of rating cells as read from the report-card table.
#' @return A numeric vector the same length as `raw`.
.parse_ratings <- function(raw) {
  raw <- as.character(raw)
  m <- regexpr("[0-9]+,[0-9]*", raw)
  # regexpr() yields -1 for "no match" and NA for NA input; both must be
  # excluded, otherwise the logical index below contains NA.
  matched <- !is.na(m) & m != -1
  parsed <- rep(NA_real_, length(m))
  parsed[matched] <- as.numeric(gsub(",", ".", regmatches(raw, m)))
  parsed
}

#' Split a "City, PROV FSA LDU" line into city, province, FSA and LDU.
#'
#' Everything before the first ", " is kept as the city, so multi-word city
#' names stay in one piece; the remainder is split on single spaces. When the
#' line has no ", " separator the whole line is split on spaces instead.
#'
#' @param line Character scalar holding the city / province / postal code.
#' @return A vector whose elements 1 to 4 are city, province, FSA and LDU
#'   (NA where a component is missing).
.split_city_line <- function(line) {
  comma_parts <- unlist(strsplit(as.character(line), ", "))
  if (length(comma_parts) < 2) {
    tokens <- unlist(strsplit(as.character(comma_parts), " "))
  } else {
    tokens <- c(comma_parts[1], unlist(strsplit(comma_parts[-1], " ")))
  }
  tokens[1:4]
}

#' Scrape one school's detail page and write its data to disk.
#'
#' Fetches the page at `url`, reads the school properties table, the
#' report-card table (melted to one row per indicator and year) and the
#' school's contact block, then saves the combined data frame as
#' "data/<school name> - Data.RData" and "data/<school name> - Data.csv".
#'
#' @param url Absolute URL of a school's detail page.
#' @return `NULL`, invisibly; the function is called for its side effects.
school_query <- function(url) {
  ## get html from page
  ecole_html <- html(url)

  ## School information: second column of the properties table, first row
  ## dropped
  school_info <- html_table(
    html_node(
      ecole_html,
      "#ctl00_ContentPlaceHolder1_detailedReportCard_SchoolProperties1_tblProps"
    ),
    trim = TRUE,
    dec = ","
  )[-1, 2]
  nombre_eleves <- school_info[1]
  pct_retard <- school_info[2]
  pct_ehdaa <- school_info[3]

  ## Academic performance: one row per (indicator, year) pair
  academic_perf <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_tblReportCard") %>%
    html_table(header = TRUE, dec = ",") %>%
    as.data.frame %>%
    melt(
      id.vars = c("Academic Performance"),
      variable.name = "annee",
      value.name = "rating"
    )
  academic_perf$rating <- .parse_ratings(academic_perf$rating)

  ## School coordinates: only children 1, 3, 5, 7, 9 and 12 of the info
  ## block are kept
  info_children <- html_children(
    html_node(ecole_html, "#ctl00_ContentPlaceHolder1_SchoolInfoDisplay")
  )
  school_coordinates <- info_children[c(1, 3, 5, 7, 9, 12)]
  school_name <- school_coordinates[1]$strong %>% html_text()
  school_status <- school_coordinates[2]$text %>% xmlValue %>% as.character
  school_address <- school_coordinates[3]$text %>% xmlValue %>% as.character
  school_city_prov_cp <- .split_city_line(
    school_coordinates[4]$text %>% xmlValue %>% as.character
  )
  school_city <- school_city_prov_cp[1]
  school_prov <- school_city_prov_cp[2]
  school_fsa <- school_city_prov_cp[3]
  school_ldu <- school_city_prov_cp[4]

  ## School website: empty string when the page has no website link
  pointer_website <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_hlSchoolWebsite")
  school_website <- ""
  if (!is.null(pointer_website)) {
    # [1] keeps this a scalar, as the previous scalar ifelse() did
    school_website <- html_attr(pointer_website, "href")[1]
  }

  ## Attach the school-level attributes to every performance row
  academic_perf$school_name <- school_name
  academic_perf$school_status <- school_status
  academic_perf$school_address <- school_address
  academic_perf$school_city <- school_city
  academic_perf$school_prov <- school_prov
  academic_perf$school_fsa <- school_fsa
  academic_perf$school_ldu <- school_ldu
  academic_perf$school_website <- school_website
  academic_perf$nombre_eleves <- nombre_eleves
  academic_perf$pct_retard <- pct_retard
  academic_perf$pct_ehdaa <- pct_ehdaa

  ## Write the results; remove every "/" from the name so the file cannot
  ## end up in a non-existent sub-directory
  dir.create("data", showWarnings = FALSE)
  file_stem <- file.path("data", gsub("/", "", school_name, fixed = TRUE))
  save(academic_perf, file = paste0(file_stem, " - Data.RData"))
  write.csv2(academic_perf, file = paste0(file_stem, " - Data.csv"))
  invisible(NULL)
}
# Scrape every school page listed in the ranking table. seq_along() is used
# so that an empty URL vector runs zero iterations (1:length(x) would iterate
# over 1 and 0). try() keeps the loop going when a single school page fails;
# the index is printed first so a failure can be traced to its row.
for (i in seq_along(quebec_raw$url)) {
  print(i)
  try(school_query(quebec_raw$url[i]))
}