Warsztaty z R dla średniozaawansowanych na Politechnice Gdańskiej
Strona warsztatu http://grupawp.github.io/pg-Rworkshop-2016/
dplyr.Rmd. 6 pięcioosobowych zespołów..Rmd. 6 pięcioosobowych zespołów..Rmd w jedną stronę HTML.0'--9’) extracted from a collection of Dutch utility maps. 200 patterns per class (for a total of 2,000 patterns) have been digitized in binary images. Wymiary: 2000 X 649library(dplyr)
# devtools::install_github('hadley/dplyr')source("https://bioconductor.org/biocLite.R") 
biocLite('RTCGA.rnaseq')
biocLite('RTCGA.clinical')
biocLite('RTCGA.mutations')The Cancer Genome Atlas (TCGA) is a comprehensive and coordinated effort to accelerate our understanding of the molecular basis of cancer through the application of genome analysis technologies, including large-scale genome sequencing.
library(RTCGA.rnaseq); data(BRCA.rnaseq) # information about genes' expressions
library(RTCGA.mutations); data(BRCA.mutations) # information about genes' mutations
library(RTCGA.clinical); data(BRCA.clinical) # patients' clinical data
BRCA.rnaseq %>%
    select(`TP53|7157`, bcr_patient_barcode) %>%
    # bcr_patient_barcode contains a key to merge patients between various datasets
    rename(TP53 = `TP53|7157`) %>%
    filter(substr(bcr_patient_barcode, 14, 15) == "01" ) %>% 
    # 01 at the 14-15th position tells these are cancer sample
    mutate(bcr_patient_barcode = substr(as.character(bcr_patient_barcode),1,12)) -> 
    # in clinical info bcr_patient_barcode is only of length 12
    BRCA.rnaseq.TP53
BRCA.mutations %>%
    select(Hugo_Symbol, bcr_patient_barcode) %>%
    # Hugo_symbol tells to which gene the row corresponds.
    # Ff the rows exist for a gene, this means there was a mutation 
    # for this patient for this gene.
    filter(nchar(bcr_patient_barcode)==15) %>%
    # sometime there are inproper lengths of this code
    filter(substr(bcr_patient_barcode, 14, 15)=="01") %>%   
    # 01 at the 14-15th position tells these are cancer sample
    filter(Hugo_Symbol == 'PIK3CA') %>%
    # we are interested only in the mutations of PIK3CA
    unique() %>% 
    # sometimes there are few mutations in the same gene
    mutate(bcr_patient_barcode = substr(as.character(bcr_patient_barcode),1,12)) -> 
    # in clinical info bcr_patient_barcode is only of length 12
    BRCA.mutations.PIK3CA
BRCA.clinical %>%
    select(patient.bcr_patient_barcode,
           patient.vital_status, # information whether patient is still alive
           patient.days_to_last_followup, # how many days has patient been observed if he is alive
           patient.days_to_death) %>% # how many days has patient been observed if he has passed away
    mutate(bcr_patient_barcode = toupper(as.character(patient.bcr_patient_barcode))) %>%
    # in clinical datasets the key column is in lower case and with different name
    mutate(status = ifelse(as.character(patient.vital_status) == "dead",1,0),
           times = ifelse( 
                        !is.na(patient.days_to_last_followup),
                        as.numeric(as.character(patient.days_to_last_followup)),
                        as.numeric(as.character(patient.days_to_death))
            )) %>%
    # if the patient does not have a days_to_last_followup time this means
    # he has days_to_death time
    mutate(times = as.numeric(times)) %>%
    filter(!is.na(times)) %>% 
    # sometime patient does not have any time
    filter(times > 0) -> BRCA.clinical.survival
    # sometimes by mistkae patients have non-positive times (few cases)
BRCA.rnaseq.TP53 %>%
    left_join(y = BRCA.mutations.PIK3CA,
                        by = "bcr_patient_barcode") %>%
    left_join(y = BRCA.clinical.survival,
                        by = "bcr_patient_barcode") %>%
    mutate(TP53_HighExpr = ifelse(TP53 >= median(TP53), "1", "0")) %>%
    mutate(PIK3CA_Mut = as.integer(!is.na(Hugo_Symbol))) %>%
    select(times, status, TP53_HighExpr, PIK3CA_Mut)  %>%
    filter(!is.na(status)) -> BRCA.2survfit
dim(BRCA.2survfit)[1] 1040    4BRCA.2survfit %>%
  group_by(TP53_HighExpr, PIK3CA_Mut, status) %>%
  summarise(counts = n()) %>%
  arrange(desc(counts))Source: local data frame [8 x 4]
Groups: TP53_HighExpr, PIK3CA_Mut [4]
  TP53_HighExpr PIK3CA_Mut status counts
          (chr)      (int)  (dbl)  (int)
1             0          0      0    359
2             0          0      1     35
3             0          1      0    118
4             0          1      1      9
5             1          0      0    298
6             1          0      1     47
7             1          1      0    161
8             1          1      1     13BRCA.2survfit %>%
  group_by(TP53_HighExpr, PIK3CA_Mut) %>%
  summarize(min = min(times),
            median = median(times),
            mean = mean(times),
            max = max(times)) -> agr_stats
library(tidyr)
agr_stats %>%
  select(-PIK3CA_Mut) %>%
  top_n(1, max) %>%
  gather(TP53_HighExpr) -> agr_stats_2viz
names(agr_stats_2viz)[2:3] <- c("statistics", "value")
library(ggplot2)
agr_stats_2viz %>%
  ggplot(aes(statistics, value,  group =TP53_HighExpr, col = TP53_HighExpr)) + 
  geom_line() +
  theme_RTCGA()
library(survminer)
library(survival)
fit <- survfit(Surv(times, status) ~ TP53_HighExpr + PIK3CA_Mut,
               data = BRCA.2survfit)
ggsurvplot(fit, theme = theme_RTCGA(), p.val = TRUE, risk.table = TRUE,
           risk.table.y.text.col = TRUE, risk.table.y.text = FALSE)
This dataset represents a set of possible advertisements on Internet pages. The features encode the geometry of the image (if available) as well as phrases occuring in the URL, the image’s URL and alt text, the anchor text, and words occuring near the anchor text. The task is to predict whether an image is an advertisement (“ad”) or not (“nonad”).
Wymiary: 3279 X 1558
ad <- read.csv("~/pg-Rworkshop-2016/dane/ad.data", header=FALSE,stringsAsFactors = FALSE)
ad[ad == "   ?"] <- 0
for(i in 1:ncol(ad)){
  gsub(" ", "", ad[, i]) %>%
    as.character() %>%
    as.numeric() -> ad[, i]
}
ad %>% 
  summarise_each(funs(mean)) %>%
  .[, 1:6]library(RTCGA.rnaseq)
library(dplyr)
BRCA.rnaseq %>%
  mutate(bcr_patient_barcode = substr(bcr_patient_barcode, 14, 14)) -> BRCA.rnaseq.tumor
BRCA.rnaseq.tumor.first<-BRCA.rnaseq.tumor[, 1:1000]
(sum(BRCA.rnaseq.tumor.first$bcr_patient_barcode==0))  #1100 guz [1] 1100(sum(BRCA.rnaseq.tumor.first$bcr_patient_barcode==1))  #112 zdrowy[1] 112library(FSelector)
information.gain(formula =bcr_patient_barcode~., data = BRCA.rnaseq.tumor.first)->wynik.info
wynik.info %>%
  mutate(nazwy = row.names(wynik.info)) %>%
  arrange(desc(attr_importance)) -> wyniki.po.info
(subset<- cutoff.biggest.diff(wynik.info))[1] "ADAMTS5|11096"  "ARHGAP20|57569"(subset)  ##geny o najbardziej wyróżniającym się wskaźniku attr_imprortance[1] "ADAMTS5|11096"  "ARHGAP20|57569"(subset11<-cutoff.k(wynik.info,10)) [1] "ADAMTS5|11096"  "ARHGAP20|57569" "ABCA10|10349"   "ABCA9|10350"    "ALDH1A2|8854"   "ABCA8|10351"    "ADRB2|154"      "ABCA6|23460"   
 [9] "ANKRD29|147463" "AQP7P1|375719" library(Boruta)
invisible(
Boruta_model <- Boruta(as.factor(bcr_patient_barcode)~.,
                       data = BRCA.rnaseq.tumor.first[,1:100], 
                       doTrace =2, ntree = 50)
)
plot(Boruta_model, las=2, xlab="")
# plotImpHistory(Boruta_model)
# getSelectedAttributeslibrary(glmnet)
model_glmnet <- glmnet(x = as.matrix(BRCA.rnaseq.tumor.first[, -1]),
                       y = as.factor(BRCA.rnaseq.tumor.first[,1]),
                       family="binomial",
                       alpha = 0, lambda.min = 1e-4)
nsteps <- 10
b1 <- coef(model_glmnet)[-1, 1:nsteps]
w <- nonzeroCoef(b1)
b1 <- as.matrix(b1[w, ])
matplot(1:nsteps, t(b1), type = "o", pch = 19,
        col = "blue", xlab = "Step",
        ylab = "Coefficients", lty = 1)
title("Lasso")
abline(h = 0, lty = 2)
Statystyka to nie czarna skrzynka, którą strach otworzyć, ale zbiór pomysłowych obserwacji pozwalających na kontrolowaną analizę danych.
To paraphrase provocatively, ’machine learning is statistics minus any checking of models and assumptions’. Brian D. Ripley (about the difference between machine learning and statistics) fortune(50)
Analiza danych to nie tylko klasyczna statystyka z zagadnieniami estymacji i testowania (najczęściej wykładanymi na standardowych kursach statystyki). Znaczny zbiór metod analizy danych nazywany technikami eksploracji danych lub data mining dotyczy zagadnień klasyfikacji, identyfikacji, analizy skupień oraz modelowania złożonych procesów.
To me, that is what statistics is all about. It is gaining insight from data using modelling and visualization. Hadley Wickham
Data mining to szybko rosnąca grupa metod analizy danych rozwijana nie tylko przez statystyków ale głównie przez biologów, genetyków, cybernetyków, informa- tyków, ekonomistów, osoby pracujące nad rozpoznawaniem obrazów, myśli i wiele innych grup zawodowych
Analiza dyskryminacyjna - Klasyfikacja
W wielu dziedzinach potrzebne są metody, potrafiące automatycznie przypisać no- wy obiekt do jednej z wyróżnionych klas. Np. w analizie kredytowej dla firm chcemy przewidzieć czy firma spłaci kredyt czy nie. Celem procesu dyskryminacji (nazywanego też klasyfikacją, uczeniem z nauczy- cielem lub uczeniem z nadzorem) jest zbudowanie reguły, potrafiącej przypisywać możliwie dokładnie nowe obiekty do znanych klas. W przypadku większości metod możliwe jest klasyfikowanie do więcej niż dwie klasy
Do oceny jakości klasyfikatorów przydzadzą się poniższe funkcje
library(ROCR)
tabela <- function(predykcja_klasy, klasy_prawdziwe){
    table(predykcja_klasy, klasy_prawdziwe)
}
procent <- function(t){
    100*sum(diag(t))/sum(t)
}
czulosc <- function(t){
    if(sum(t[2,])==0) return(0) else t[2,2]/(sum(t[2,]))
}
precyzja <- function(t){
    if(sum(t[,2])==0) return(0) else t[2,2]/sum(t[,2])
}
roc <- function(pred_prawdopod, prawdziwe_klasy,...){
    pred <- prediction(pred_prawdopod, prawdziwe_klasy)
    perf <- performance(pred, measure="tpr",x.measure="fpr")
    plot(perf,col="red",...)
    abline(0,1)
    lines(c(0.5,0.5),c(-0.1,1.1),lty=2,col="green")
}
auc <- function(pred_prawdopod, prawdziwe_klasy){
    pred <- prediction(pred_prawdopod, prawdziwe_klasy)
    performance(pred, "auc")@y.values[[1]]
}Do prezentacji zależności pomiędzy zbiorem wyjściowym a uzyskanym klasyfikatorem służy macierz trafności, gdzie korzystamy z notacji:
Dokładność (procent poprawnego dopasowania) - accuracy
\((TP+TN)/(TP+FP+FN+TN)\)
Precyzja - precision (positive predictive value)
\(TP/(TP+FP)\)
Czułóść - recall/sensitivity (true positive rate)
\(TP/(TP+FN)\)
false positive rate
\(FP/(FP+TN)\)
library(e1071)
which(apply(BRCA.rnaseq.tumor.first[1:1000,], MARGIN = 2, sd) < 0.2) -> do_wywalenia
# dopasowanie optymalnych parametrow `cost` i `gamma`
obj <- tune(svm, as.factor(bcr_patient_barcode)~.,
            data=BRCA.rnaseq.tumor.first[1:1000,-do_wywalenia], 
            ranges = list(gamma = 2^(-1:1), cost = 2^(1:4)),
            tunecontrol = tune.control(sampling = "cross")
)
obj$best.parameters  gamma cost
1   0.5    2mod_svm_opt <- svm(as.factor(bcr_patient_barcode)~., 
                   data=BRCA.rnaseq.tumor.first[1:1000,-do_wywalenia],
                   type="C", kernel="radial",
                   gamma=obj$best.parameters[1],
                   cost=obj$best.parameters[2],
                   probability=TRUE)
pred_svm <- predict(mod_svm_opt, BRCA.rnaseq.tumor.first[1001:1212,-do_wywalenia],
                    probability=TRUE)
pred_praw_svm <- attr(pred_svm,"probabilities")[,2]
klasy_pred_svm <- ifelse( pred_praw_svm > 0.5, 1, 0)
tabela(klasy_pred_svm, BRCA.rnaseq.tumor.first[1001:1212,1]) -> tab_svm
tab_svm               klasy_prawdziwe
predykcja_klasy   0   1
              0 201  11# precyzja(tab_svm)
# czulosc(tab_svm)
# procent(tab_svm)
roc(pred_praw_svm, as.integer(BRCA.rnaseq.tumor.first[1001:1212,1]))
model_bayes <- naiveBayes(bcr_patient_barcode~.,
                          data=BRCA.rnaseq.tumor.first[1:1000,-do_wywalenia], laplace=0.2)
bayes_prawd <- predict(model_bayes, newdata=BRCA.rnaseq.tumor.first[1001:1212,-do_wywalenia], 
                       type="raw")[,2]
klasy_bayes_pred <- ifelse( bayes_prawd >0.5, 1, 0)
tabela(klasy_bayes_pred, as.integer(BRCA.rnaseq.tumor.first[1001:1212,1])) -> tab_bayes
tab_bayes               klasy_prawdziwe
predykcja_klasy   0   1
              0 200   0
              1   1  11precyzja(tab_bayes)[1] 1czulosc(tab_bayes)[1] 0.9166667procent(tab_bayes)[1] 99.5283roc(bayes_prawd, as.integer(BRCA.rnaseq.tumor.first[1001:1212,1]))
Session info:
sessionInfo()R version 3.3.0 (2016-05-03)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 14.04.4 LTS
locale:
 [1] LC_CTYPE=pl_PL.UTF-8          LC_NUMERIC=C                  LC_TIME=pl_PL.UTF-8           LC_COLLATE=pl_PL.UTF-8       
 [5] LC_MONETARY=pl_PL.UTF-8       LC_MESSAGES=pl_PL.UTF-8       LC_PAPER=pl_PL.UTF-8          LC_NAME=pl_PL.UTF-8          
 [9] LC_ADDRESS=pl_PL.UTF-8        LC_TELEPHONE=pl_PL.UTF-8      LC_MEASUREMENT=pl_PL.UTF-8    LC_IDENTIFICATION=pl_PL.UTF-8
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] knitr_1.13                   e1071_1.6-7                  ROCR_1.0-7                   gplots_3.0.1                 glmnet_2.0-5                
 [6] foreach_1.4.3                Matrix_1.2-6                 Boruta_5.0.0                 ranger_0.4.0                 FSelector_0.20              
[11] survival_2.39-4              survminer_0.2.1.900          ggplot2_2.1.0                tidyr_0.4.1                  RTCGA.clinical_20151101.2.0 
[16] RTCGA.mutations_20151101.2.0 RTCGA.rnaseq_20151101.2.0    RTCGA_1.2.2                  dplyr_0.4.3                  rmarkdown_0.9.6             
loaded via a namespace (and not attached):
 [1] Rcpp_0.12.5         lattice_0.20-33     RWeka_0.4-27        class_7.3-14        gtools_3.5.0        assertthat_0.1      digest_0.6.9       
 [8] R6_2.1.2            plyr_1.8.4          chron_2.3-47        evaluate_0.9        httr_1.1.0          RWekajars_3.9.0-1   lazyeval_0.1.10    
[15] data.table_1.9.6    gdata_2.17.0        labeling_0.3        devtools_1.11.1     splines_3.3.0       stringr_1.0.0       pander_0.6.0       
[22] munsell_0.4.3       htmltools_0.3.5     gridExtra_2.2.1     codetools_0.2-14    randomForest_4.6-12 XML_3.98-1.4        withr_1.0.1        
[29] bitops_1.0-6        grid_3.3.0          gtable_0.2.0        DBI_0.4-1           magrittr_1.5        formatR_1.4         scales_0.4.0       
[36] KernSmooth_2.23-15  stringi_1.1.1       viridis_0.3.4       ggthemes_3.0.3      xml2_0.1.2          iterators_1.0.8     tools_3.3.0        
[43] entropy_1.2.1       purrr_0.2.1         parallel_3.3.0      yaml_2.1.13         colorspace_1.2-6    caTools_1.17.1      rvest_0.3.1        
[50] memoise_1.0.0       rJava_0.9-8