# install.packages("rvest")
library(rvest)
library(reshape2)
library(XML)
library(parallel)

# NOTE(review): `html()` and the `$text` / `xmlValue()` node access further
# down rely on the XML-backed rvest release this script was written against.
# Moving to `read_html()` would also require rewriting the node extraction,
# so both are kept as-is here -- confirm the installed rvest version.

# Ranking table ----

quebec_html <- html("http://quebec.compareschoolrankings.org/secondary/SchoolsByRankLocationName.aspx")

noms_colonnes <- c(
  "Rang 2013-14",
  "Rang des 5 dernières années",
  "Tendance",
  "School Name",
  "City",
  "Cote globale 2013-14",
  "Cote globale moyenne des 5 dernières années"
)

# Drop the first parsed row and keep the seven leading columns.
quebec_raw <- (quebec_html %>%
  html_node(css = "table.rating") %>%
  html_table(fill = TRUE, header = TRUE))[-1, 1:7]
colnames(quebec_raw) <- noms_colonnes

# The per-school links are relative hrefs; prefix them with the site root.
quebec_raw$url <- paste0(
  "http://quebec.compareschoolrankings.org",
  quebec_html %>% html_nodes(css = ".tdcell a") %>% html_attr("href")
)

# School level query ----

#' Scrape one school's detail page and write its data to disk.
#'
#' Downloads the page at `url`, reads the school properties table, the
#' academic performance report card (melted to one row per indicator and
#' year), the address block and the website link, then writes the combined
#' data frame to `data/<school name> - Data.RData` and
#' `data/<school name> - Data.csv`.
#'
#' @param url Address of a school detail page.
#' @return The assembled data frame, invisibly. Called for its side effect
#'   of writing the two files.
school_query <- function(url) {
  ## get html from page
  ecole_html <- html(url)

  ## School information: second column of the properties table, minus the
  ## first row.
  school_info <- (ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_SchoolProperties1_tblProps") %>%
    html_table(trim = TRUE, dec = ","))[-1, 2]
  nombre_eleves <- school_info[1]
  pct_retard <- school_info[2]
  pct_ehdaa <- school_info[3]

  ## Academic performance, reshaped to long format.
  academic_perf <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_tblReportCard") %>%
    html_table(header = TRUE, dec = ",") %>%
    as.data.frame %>%
    melt(
      id.vars = c("Academic Performance"),
      variable.name = "annee",
      value.name = "rating"
    )

  # Pull the first comma-decimal number out of every cell. The result vector
  # is pre-filled with NA at full length so that cells without a number
  # (including trailing ones, and NA cells) stay NA instead of shortening
  # the vector or breaking the logical subscript.
  rating_text <- as.character(academic_perf$rating)
  m <- regexpr("[0-9]+,[0-9]*", rating_text)
  has_rating <- !is.na(m) & m != -1
  rating <- rep(NA_real_, length(m))
  rating[has_rating] <- as.numeric(
    gsub(",", ".", regmatches(rating_text, m), fixed = TRUE)
  )
  academic_perf$rating <- rating

  ## School coordinates: selected children of the school info block.
  school_coordinates <- (ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_SchoolInfoDisplay") %>%
    html_children())[c(1, 3, 5, 7, 9, 12)]

  # Split the fourth entry on ", " and then on spaces.
  # NOTE(review): a city name containing a space would shift the province
  # and postal-code pieces -- verify against the page format.
  school_city_prov_cp <- school_coordinates[4]$text %>%
    xmlValue %>%
    as.character %>%
    strsplit(", ") %>%
    lapply(strsplit, " ") %>%
    unlist

  school_name <- school_coordinates[1]$strong %>% html_text()
  school_status <- school_coordinates[2]$text %>% xmlValue %>% as.character
  school_address <- school_coordinates[3]$text %>% xmlValue %>% as.character
  school_city <- school_city_prov_cp[1]
  school_prov <- school_city_prov_cp[2]
  school_fsa <- school_city_prov_cp[3]
  school_ldu <- school_city_prov_cp[4]

  ## School website (empty string when the link node is NULL).
  pointer_website <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_hlSchoolWebsite")
  school_website <- if (is.null(pointer_website)) {
    ""
  } else {
    html_attr(pointer_website, "href")
  }

  ## Attach the school-level values to every row.
  academic_perf$school_name <- school_name
  academic_perf$school_status <- school_status
  academic_perf$school_address <- school_address
  academic_perf$school_city <- school_city
  academic_perf$school_prov <- school_prov
  academic_perf$school_fsa <- school_fsa
  academic_perf$school_ldu <- school_ldu
  academic_perf$school_website <- school_website
  academic_perf$nombre_eleves <- nombre_eleves
  academic_perf$pct_retard <- pct_retard
  academic_perf$pct_ehdaa <- pct_ehdaa

  # Remove every "/" from the name so it cannot introduce sub-directories
  # in the output path.
  file_stem <- paste0("data/", gsub("/", "", school_name, fixed = TRUE))
  # The object is saved under the name `academic_perf`, which is what
  # load() will restore.
  save(academic_perf, file = paste0(file_stem, " - Data.RData"))
  write.csv2(academic_perf, file = paste0(file_stem, " - Data.csv"))

  invisible(academic_perf)
}

# Scrape every school ----

# Output files are written under data/; make sure the directory exists.
dir.create("data", showWarnings = FALSE)

for (i in seq_along(quebec_raw$url)) {
  print(i)
  # One failing page must not stop the run; report which URL was skipped.
  tryCatch(
    school_query(quebec_raw$url[i]),
    error = function(e) {
      message("Skipping ", quebec_raw$url[i], ": ", conditionMessage(e))
    }
  )
}