95 lines
3.2 KiB
R
95 lines
3.2 KiB
R
#install.packages("rvest")
|
|
|
|
library(rvest)
|
|
library(reshape2)
|
|
library(XML)
|
|
library(parallel)
|
|
|
|
# Download the province-wide ranking page once; it is reused below for both
# the ranking table and the per-school links.
quebec_html <- html(
  "http://quebec.compareschoolrankings.org/secondary/SchoolsByRankLocationName.aspx"
)

# Column labels applied to the seven columns kept from the ranking table.
noms_colonnes <- c(
  "Rang 2013-14",
  "Rang des 5 dernières années",
  "Tendance",
  "School Name",
  "City",
  "Cote globale 2013-14",
  "Cote globale moyenne des 5 dernières années"
)

# Parse the node matching "table.rating" into a data frame, drop its first
# row, keep the first seven columns, then relabel them.
quebec_raw <- html_table(
  html_node(quebec_html, css = "table.rating"),
  fill = TRUE,
  header = TRUE
)
quebec_raw <- quebec_raw[-1, 1:7]
colnames(quebec_raw) <- noms_colonnes

# Build one absolute URL per school from the href of every ".tdcell a" link.
# NOTE(review): assumes the number of links equals nrow(quebec_raw) - confirm.
quebec_raw$url <- paste0(
  "http://quebec.compareschoolrankings.org",
  html_attr(html_nodes(quebec_html, css = ".tdcell a"), "href")
)
|
|
|
|
# School level query

#' Scrape one school's detailed report page and write the result to disk.
#'
#' Downloads the page at `url`, reads the school-properties table, the report
#' card table (melted to one row per "Academic Performance" x year), the
#' school's name/status/address block and its website link, then saves the
#' combined data frame under "data/" as both an .RData and a semicolon CSV
#' file named after the school.
#'
#' @param url Character scalar: address of a school's detailed report page.
#' @return Called for its side effects (two files written under "data/");
#'   the value is whatever `write.csv2()` returns.
school_query <- function(url) {
  ## get html from page
  ecole_html <- html(url)

  ## School information
  # Drop the first row of the properties table and keep its second column.
  school_info <- (ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_SchoolProperties1_tblProps") %>%
    html_table(trim = TRUE, dec = ","))[-1, 2]

  # NOTE(review): meaning of each position is inferred from the variable
  # names only - confirm against the page layout.
  nombre_eleves <- school_info[1]
  pct_retard <- school_info[2]
  pct_ehdaa <- school_info[3]

  ## Academic performance
  academic_perf <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_detailedReportCard_tblReportCard") %>%
    html_table(header = TRUE, dec = ",") %>%
    as.data.frame() %>%
    melt(
      id.vars = c("Academic Performance"),
      variable.name = "annee",
      value.name = "rating"
    )

  # Pull the first "digits,digits" token out of each cell and convert it to a
  # number. The result is preallocated with NA at full length so that cells
  # without a match (including trailing ones) stay NA and the vector always
  # has one entry per row of academic_perf.
  raw_rating <- as.character(academic_perf$rating)
  m <- regexpr("[0-9]+,[0-9]*", raw_rating)
  has_match <- !is.na(m) & m != -1
  rating <- rep(NA_real_, length(raw_rating))
  rating[has_match] <- as.numeric(gsub(",", ".", regmatches(raw_rating, m)))
  academic_perf$rating <- rating

  ## School coordinates
  # Keep only the child nodes at these fixed positions of the info block.
  school_coordinates <- (ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_SchoolInfoDisplay") %>%
    html_children())[c(1, 3, 5, 7, 9, 12)]

  # Split the fourth kept node's text on ", " and then on " ", flattening the
  # pieces into one character vector.
  school_city_prov_cp <-
    school_coordinates[4]$text %>%
    xmlValue %>%
    as.character %>%
    strsplit(", ") %>%
    lapply(strsplit, " ") %>%
    unlist

  school_name <- school_coordinates[1]$strong %>% html_text()
  school_status <- school_coordinates[2]$text %>% xmlValue %>% as.character
  school_address <- school_coordinates[3]$text %>% xmlValue %>% as.character
  # NOTE(review): positions assume a single-word city followed by province
  # and a two-part postal code - confirm for multi-word city names.
  school_city <- school_city_prov_cp[1]
  school_prov <- school_city_prov_cp[2]
  school_fsa <- school_city_prov_cp[3]
  school_ldu <- school_city_prov_cp[4]

  ## School website
  pointer_website <- ecole_html %>%
    html_node("#ctl00_ContentPlaceHolder1_hlSchoolWebsite")
  school_website <- if (is.null(pointer_website)) {
    ""
  } else {
    html_attr(pointer_website, "href")
  }

  ## Attach the school-level values to every row of the performance table
  academic_perf$school_name <- school_name
  academic_perf$school_status <- school_status
  academic_perf$school_address <- school_address
  academic_perf$school_city <- school_city
  academic_perf$school_prov <- school_prov
  academic_perf$school_fsa <- school_fsa
  academic_perf$school_ldu <- school_ldu
  academic_perf$school_website <- school_website
  academic_perf$nombre_eleves <- nombre_eleves
  academic_perf$pct_retard <- pct_retard
  academic_perf$pct_ehdaa <- pct_ehdaa

  ## Write the result
  # Make sure the output directory exists; without it every save would fail.
  dir.create("data", showWarnings = FALSE)
  # Remove every "/" from the school name so it cannot be read as a
  # sub-directory in the output path.
  file_stem <- paste0("data/", gsub("/", "", school_name, fixed = TRUE), " - Data")
  save(academic_perf, file = paste0(file_stem, ".RData"))
  write.csv2(academic_perf, file = paste0(file_stem, ".csv"))
}
|
|
|
|
# Scrape every school listed in quebec_raw$url, one at a time.
# seq_along() runs zero iterations when there are no URLs (1:length(x) would
# run over c(1, 0)). A failure on one school is reported with its index and
# URL and the loop carries on with the next one.
for (i in seq_along(quebec_raw$url)) {
  print(i)
  tryCatch(
    school_query(quebec_raw$url[i]),
    error = function(e) {
      message(
        "School ", i, " (", quebec_raw$url[i], ") failed: ",
        conditionMessage(e)
      )
    }
  )
}
|