# Scraper for the Quebec secondary-school rankings published on
# quebec.compareschoolrankings.org. It reads the ranking table, then visits
# every school page and writes one .RData and one .csv file per school.
#
# Provenance: content of schoolrankings.R as added by commit
# fb1ff96118699c2d7d75afc051c7168b4bf4639b ("première version",
# François Pelletier, Sat Nov 7 15:54:16 2015 -0500). The same commit also
# added the binary archive data.zip (index d5283bc).

#install.packages("rvest")

library(rvest)
library(reshape2)
library(XML)
library(parallel)

# NOTE(review): `html()` and the `$text` / `$strong` + `xmlValue()` node access
# used below presumably rely on the XML-based rvest API (newer rvest releases
# use `read_html()` and xml2 node sets) -- confirm before upgrading rvest.

# Ranking table ----

quebec_html <- html(
  "http://quebec.compareschoolrankings.org/secondary/SchoolsByRankLocationName.aspx"
)

noms_colonnes <- c(
  "Rang 2013-14",
  "Rang des 5 dernières années",
  "Tendance",
  "School Name",
  "City",
  "Cote globale 2013-14",
  "Cote globale moyenne des 5 dernières années"
)

ranking_table <- quebec_html %>%
  html_node(css = "table.rating") %>%
  html_table(fill = TRUE, header = TRUE)

# Drop the first parsed row and keep the seven leading columns.
quebec_raw <- ranking_table[-1, 1:7]
colnames(quebec_raw) <- noms_colonnes

# Absolute URL of every school page linked from the ranking table.
quebec_raw$url <- paste0(
  "http://quebec.compareschoolrankings.org",
  quebec_html %>% html_nodes(css = ".tdcell a") %>% html_attr("href")
)

# School level query ----

# Extract the first decimal-comma number found in each element of `x`.
#
# x: character vector (anything else is coerced with as.character()).
# Returns a numeric vector of the same length as `x`; elements without a
# match (or that are NA) yield NA.
parse_decimal_comma <- function(x) {
  x <- as.character(x)
  m <- regexpr("[0-9]+,[0-9]*", x)
  hit <- !is.na(m) & m != -1L
  # Preallocate to the full length: growing a scalar NA through a logical
  # index would silently drop trailing non-matches.
  out <- rep(NA_real_, length(x))
  out[hit] <- as.numeric(gsub(",", ".", regmatches(x, m), fixed = TRUE))
  out
}

# Split a "City, PROV FSA LDU" line into its four components.
#
# line: single character string.
# Returns a named character vector (city, prov, fsa, ldu); missing pieces
# are NA. Only the part after the comma is split on spaces, so a city name
# containing spaces stays in one piece.
parse_city_line <- function(line) {
  empty <- c(city = NA_character_, prov = NA_character_,
             fsa = NA_character_, ldu = NA_character_)
  if (length(line) == 0L || is.na(line[1])) {
    return(empty)
  }
  parts <- strsplit(line[1], ", ", fixed = TRUE)[[1]]
  rest <- unlist(strsplit(parts[-1], " ", fixed = TRUE))
  rest <- c(rest, rep(NA_character_, 3L))[seq_len(3L)]
  c(city = parts[1], prov = rest[1], fsa = rest[2], ldu = rest[3])
}

# Download one school page, flatten its report card and save it to disk.
#
# url:     address of the school page.
# out_dir: directory receiving "<school name> - Data.RData" and
#          "<school name> - Data.csv"; created when missing.
# Returns (invisibly) the long-format data frame that was written: one row
# per report-card line and year, with the school attributes repeated on
# every row.
school_query <- function(url, out_dir = "data") {
  ## get html from page
  ecole_html <- html(url)

  ## School information
  props_table <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_SchoolProperties1_tblProps") %>%
    html_table(trim = TRUE, dec = ",")
  school_info <- props_table[-1, 2]

  nombre_eleves <- school_info[1]
  pct_retard <- school_info[2]
  pct_ehdaa <- school_info[3]

  ## Academic performance
  academic_perf <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_tblReportCard") %>%
    html_table(header = TRUE, dec = ",") %>%
    as.data.frame %>%
    melt(
      id.vars = c("Academic Performance"),
      variable.name = "annee",
      value.name = "rating"
    )

  # Keep only the numeric part of each cell, converted to a real number.
  academic_perf$rating <- parse_decimal_comma(academic_perf$rating)

  ## School coordinates
  info_nodes <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_SchoolInfoDisplay") %>%
    html_children()
  school_coordinates <- info_nodes[c(1, 3, 5, 7, 9, 12)]

  node_text <- function(node) as.character(xmlValue(node))

  school_name <- school_coordinates[1]$strong %>% html_text()
  school_status <- node_text(school_coordinates[2]$text)
  school_address <- node_text(school_coordinates[3]$text)
  location <- parse_city_line(node_text(school_coordinates[4]$text))

  ## School website
  pointer_website <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_hlSchoolWebsite")
  school_website <- if (is.null(pointer_website)) {
    ""
  } else {
    html_attr(pointer_website, "href")
  }

  ## attach the school attributes to every row
  academic_perf$school_name <- school_name
  academic_perf$school_status <- school_status
  academic_perf$school_address <- school_address
  academic_perf$school_city <- location[["city"]]
  academic_perf$school_prov <- location[["prov"]]
  academic_perf$school_fsa <- location[["fsa"]]
  academic_perf$school_ldu <- location[["ldu"]]
  academic_perf$school_website <- school_website
  academic_perf$nombre_eleves <- nombre_eleves
  academic_perf$pct_retard <- pct_retard
  academic_perf$pct_ehdaa <- pct_ehdaa

  ## write the result
  # Remove every slash so the school name cannot point into a sub-directory.
  file_stem <- gsub("/", "", school_name, fixed = TRUE)
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
  save(
    academic_perf,
    file = file.path(out_dir, paste0(file_stem, " - Data.RData"))
  )
  write.csv2(
    academic_perf,
    file = file.path(out_dir, paste0(file_stem, " - Data.csv"))
  )

  invisible(academic_perf)
}

# Driver ----

# Query every school; a failing page is reported and skipped so that the
# remaining schools are still processed.
for (i in seq_along(quebec_raw$url)) {
  print(i)
  tryCatch(
    school_query(quebec_raw$url[i]),
    error = function(e) {
      message(
        "school_query failed for ", quebec_raw$url[i], ": ",
        conditionMessage(e)
      )
    }
  )
}