180 lines
5.1 KiB
Text
180 lines
5.1 KiB
Text
|
---
|
||
|
title: "PipelineViz"
|
||
|
author: "François Pelletier"
|
||
|
date: "26 août 2015"
|
||
|
output:
|
||
|
html_document:
|
||
|
fig_caption: yes
|
||
|
fig_height: 8
|
||
|
fig_width: 10
|
||
|
keep_md: yes
|
||
|
number_sections: yes
|
||
|
toc: yes
|
||
|
---
|
||
|
|
||
|
# Analyse d'un jeu de données ouvertes
|
||
|
|
||
|
Source: [Données sur les évenements de pipeline de Janvier 2004 au présent](http://ouvert.canada.ca/data/fr/dataset/da1be1b4-5e2b-4c9a-ae0e-e8551fd6b265)
|
||
|
|
||
|
## Chargement du dictionnaire de données
|
||
|
|
||
|
```{r}
|
||
|
# install.packages(c("data.table", "dplyr","maps","ggmap"))
|
||
|
options(unzip = 'internal')
|
||
|
library("data.table")
|
||
|
library("dplyr")
|
||
|
library("maps")
|
||
|
library("ggmap")
|
||
|
library("reshape2")
|
||
|
library("lubridate")
|
||
|
library("rpart")
|
||
|
library("rpart.plot")
|
||
|
```
|
||
|
|
||
|
## Lecture des données
|
||
|
|
||
|
```{r}
|
||
|
datadict <- read.csv(file = "dd-20150216.csv",header = TRUE, stringsAsFactors = TRUE, encoding = "latin1") %>%
|
||
|
as.data.table()
|
||
|
donnees <- read.csv(file = "p2015-07-fr.csv",header = TRUE,stringsAsFactors = TRUE) %>%
|
||
|
as.data.table()
|
||
|
|
||
|
class_donnees <- sapply(donnees,class)
|
||
|
character_columns <- names(donnees)[class_donnees == "character"]
|
||
|
num_columns <- names(donnees)[class_donnees != "character"]
|
||
|
|
||
|
donnees_conv <- cbind(donnees[, lapply(.SD, iconv, "utf-8", "latin1"),
|
||
|
.SDcols = character_columns],donnees[,.SD,.SDcols = num_columns])
|
||
|
```
|
||
|
|
||
|
## Geocoder les incidents
|
||
|
|
||
|
```{r, eval=FALSE}
|
||
|
LocationsUnique <- unique(donnees_conv[,.(LOCATION,PROVINCE)])
|
||
|
catLocationsUnique <- unique(paste(LocationsUnique$LOCATION, ifelse(!is.na(LocationsUnique$PROVINCE),LocationsUnique$PROVINCE, ""), "CANADA", sep = " "))
|
||
|
geocodedLocations <- cbind(LocationsUnique,geocode(catLocationsUnique))
|
||
|
save(geocodedLocations, file = "geocodedLocations.Rdata")
|
||
|
```
|
||
|
|
||
|
## Joindre les données géocodées aux données existantes
|
||
|
|
||
|
```{r}
|
||
|
load("geocodedLocations.Rdata")
|
||
|
donnees_conv2 <-as.data.table(merge(donnees_conv,geocodedLocations, by=c("LOCATION","PROVINCE")))
|
||
|
donnees_conv3 <- donnees_conv2[!is.na(PROVINCE) & !is.na(LOCATION)]
|
||
|
```
|
||
|
|
||
|
## Préparation de la carte de fond
|
||
|
|
||
|
```{r, eval=FALSE}
|
||
|
range_lat <- range(donnees_conv3$lat)
|
||
|
range_lon <- range(donnees_conv3$lon)
|
||
|
|
||
|
openstreetmap1 <- get_openstreetmap(bbox = c(left = range_lon[1],
|
||
|
bottom = range_lat[1],
|
||
|
right = range_lon[2],
|
||
|
top = range_lat[2]),
|
||
|
scale = 32500000,
|
||
|
format = c("png"),
|
||
|
crop = TRUE, messaging = FALSE, urlonly = FALSE, color = c("color", "bw"))
|
||
|
|
||
|
save(openstreetmap1, file = "openstreetmap1.Rdata")
|
||
|
```
|
||
|
|
||
|
## Chargement de la carte de fond
|
||
|
|
||
|
```{r}
|
||
|
load("openstreetmap1.Rdata")
|
||
|
```
|
||
|
|
||
|
|
||
|
## Carte des incidents avec déversement
|
||
|
|
||
|
```{r, dev='png'}
|
||
|
donnees_incidents <- donnees_conv3[QUANTITY_LOST > 0]
|
||
|
ggmap(openstreetmap1) +
|
||
|
geom_point(aes(x = lon,
|
||
|
y = lat,
|
||
|
colour = PRODUCT_TYPE,
|
||
|
size = QUANTITY_LOST),
|
||
|
data = donnees_incidents) +
|
||
|
scale_size(range = c(2, 8), name=datadict[datadict$Nom.de.colonne.PODS=="QUANTITY_LOST"][2]) +
|
||
|
scale_colour_discrete(name=datadict[datadict$Nom.de.colonne.PODS=="PRODUCT_TYPE"][2]) +
|
||
|
ggtitle(label = "Incidents avec déversement") +
|
||
|
xlab("Longitude") +
|
||
|
ylab("Latitude")
|
||
|
```
|
||
|
|
||
|
## initiales
|
||
|
|
||
|
Cette fonction extrait les initiales d'un mot
|
||
|
|
||
|
```{r}
|
||
|
initials <- function(str)
|
||
|
{
|
||
|
regex <- '(?<=^|\\s|-)[[:alpha:]]'
|
||
|
ini <- regmatches(str, gregexpr(regex, str, perl=TRUE))
|
||
|
toupper(sapply(ini, paste0, collapse=''))
|
||
|
}
|
||
|
```
|
||
|
|
||
|
## Distribution des accidents et incidents
|
||
|
|
||
|
```{r}
|
||
|
dist_incidents <- donnees_conv3 %>%
|
||
|
select(PROVINCE,OCC_TYPE,PRODUCT_TYPE) %>%
|
||
|
group_by(PROVINCE=factor(initials(PROVINCE)),OCC_TYPE=factor(OCC_TYPE)) %>%
|
||
|
summarise(N = n())
|
||
|
```
|
||
|
|
||
|
```{r, dev='png'}
|
||
|
ggplot(dist_incidents,aes(x=PROVINCE,fill=OCC_TYPE)) +
|
||
|
geom_bar(aes(y=N),stat="identity") +
|
||
|
ggtitle(paste0(datadict[datadict$Nom.de.colonne.PODS=="OCC_TYPE"][2],", par ",datadict[datadict$Nom.de.colonne.PODS=="PROVINCE"][2])) +
|
||
|
scale_fill_discrete(name=datadict[datadict$Nom.de.colonne.PODS=="OCC_TYPE"][2]) +
|
||
|
xlab(datadict[datadict$Nom.de.colonne.PODS=="PROVINCE"][2]) +
|
||
|
ylab("Nombre")
|
||
|
```
|
||
|
|
||
|
## Série chronologique
|
||
|
|
||
|
```{r}
|
||
|
data_serie <- donnees_conv3 %>%
|
||
|
select(PIPELINE_CO,OCC_DATE,QUANTITY_LOST) %>%
|
||
|
mutate(N=1, date_clean = parse_date_time(OCC_DATE, orders = "%Y-%m-%d")) %>%
|
||
|
arrange(PIPELINE_CO,date_clean) %>%
|
||
|
group_by(PIPELINE_CO,date_clean) %>%
|
||
|
summarise(Nsum = sum(N)) %>%
|
||
|
mutate(Ncumsum = cumsum(Nsum))
|
||
|
```
|
||
|
|
||
|
### Nombre d'évènements cumulés
|
||
|
|
||
|
```{r, dev='png'}
|
||
|
ggplot(data_serie,aes(x=date_clean,y=Ncumsum,color=PIPELINE_CO)) +
|
||
|
geom_line(stat="identity") +
|
||
|
xlab("Date") +
|
||
|
ylab("Nombre d'indicents cumulés") +
|
||
|
ggtitle("Nombre d'incidents cumulés, par compagnie")
|
||
|
```
|
||
|
|
||
|
## Arbre de décision
|
||
|
```{r}
|
||
|
tree1 <- rpart(OCC_TYPE~FACILITY_TYPE+PROD_RELEASED_IND+PIPELINE_TYPE_CO+ENVIRO_DAMAGE_IND+PROPERTY_DAMAGE_IND+FIRE_IND+EXPLOSION_IND, data=donnees_conv3, method="class")
|
||
|
summary(tree1)
|
||
|
```
|
||
|
|
||
|
### Affichage de l'arbre de décision
|
||
|
|
||
|
```{r}
|
||
|
rpart.plot::rpart.plot(tree1, main = paste0("Arbre de classification pour ",
|
||
|
datadict[datadict$Nom.de.colonne.PODS=="OCC_TYPE"]$Nom.du.champ))
|
||
|
```
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|