Learning & Reasoning/R

Signal and time series seen from eight miles high cloud

이현봉 2015. 2. 15. 17:35

20, 30대 읽던 글들을 이젠 이해하기가 어렵다. 산만한 정신에 빠른 지름길을 찾으려고만 하니. 

졸업할 때 signal ensemble을 다루는 일을 직업으로 할 기회가 있었는데, 그 길을 가지 않았다.  그 길을 택했으면 지금보다 더 나은 사람이 되었을 것인가는 불분명하지만, 지금보다 훨씬 더 나은 사정에 있었을 것이다. 

이제 그 때보다 능력은 훨씬 못하고, 주위에 동료는 없으나, 나 자신을 위한 것이니 아예 못하기 전 정리를 시작해 보자.

main


Purpose of this exercise: get to know how multivariate signals / time series (MTS) can be represented.

Good/Bad component classification based on multiple multivariate time series sensor data
- Bad component data : “Bad_Karma”

Bad_Karma.zip

- Good component data : “Good_Karma”

Good_Karma.zip

1. Read in data

library(stringr)
source("util.R")

# Read every file in `files` as a headerless, comma-separated table and tag
# each row with the instance id cut out of the file path and with `label`.
#   files    : character vector of full file paths
#   id_start : negative offset, counted from the end of the path, of the first
#              id character; the id always ends at the 5th character from the
#              end (just before a 4-character extension such as ".csv" --
#              presumably; confirm against the actual file names)
#   label    : value written into the `label` column ("bad" / "good")
# Returns a list holding one data frame per file (empty list for no files).
read_labelled_files <- function(files, id_start, label) {
  lapply(files, function(file) {
    file.temp <- read.table(file, header = FALSE, sep = ",")
    file.temp$id <- str_sub(file, start = id_start, end = -5)
    file.temp$label <- label
    file.temp
  })
}

files_bad_components <- list.files(path = file.path(getwd(), "Bad_Karma"),
                                   full.names = TRUE)
# Bad ids are 5 characters long: 9th-from-last through 5th-from-last character.
frames_bad <- read_labelled_files(files_bad_components, -9, "bad")

files_good_components <- list.files(path = file.path(getwd(), "Good_Karma"),
                                    full.names = TRUE)
# Good ids are 4 characters long: 8th-from-last through 5th-from-last character.
frames_good <- read_labelled_files(files_good_components, -8, "good")

# Bind everything in a single rbind() call instead of growing `data` inside the
# loops (which re-copies the accumulated frame on every iteration). The leading
# empty data.frame() keeps the result an empty data frame when no files exist.
data <- do.call(rbind, c(list(data.frame()), frames_bad, frames_good))

unique(data$id)   # There are 129 instances of MTS type 
##   [1] "12915" "12916" "12917" "12918" "12936" "12937" "12938" "12939"
##   [9] "12940" "13120" "13121" "13122" "13141" "13142" "13143" "13318"
##  [17] "13319" "13320" "13339" "13340" "13341" "2901"  "2902"  "2903" 
##  [25] "2904"  "2905"  "2906"  "2907"  "2908"  "2909"  "2910"  "2911" 
##  [33] "2912"  "2913"  "2914"  "2919"  "2920"  "2921"  "2922"  "2923" 
##  [41] "2924"  "2925"  "2926"  "2927"  "2928"  "2929"  "2930"  "2931" 
##  [49] "2932"  "2933"  "2934"  "2935"  "2941"  "2942"  "2943"  "3101" 
##  [57] "3102"  "3103"  "3104"  "3105"  "3106"  "3107"  "3108"  "3109" 
##  [65] "3110"  "3111"  "3112"  "3113"  "3114"  "3115"  "3116"  "3117" 
##  [73] "3118"  "3119"  "3123"  "3124"  "3125"  "3126"  "3127"  "3128" 
##  [81] "3129"  "3130"  "3131"  "3132"  "3133"  "3134"  "3135"  "3136" 
##  [89] "3137"  "3138"  "3139"  "3140"  "3301"  "3302"  "3303"  "3304" 
##  [97] "3305"  "3306"  "3307"  "3308"  "3309"  "3310"  "3311"  "3312" 
## [105] "3313"  "3314"  "3315"  "3316"  "3317"  "3321"  "3322"  "3323" 
## [113] "3324"  "3325"  "3326"  "3327"  "3328"  "3329"  "3330"  "3331" 
## [121] "3332"  "3333"  "3334"  "3335"  "3336"  "3337"  "3338"  "3342" 
## [129] "3343"

Data loading is done; now let’s do some cleansing.

library(plyr)
## Warning: package 'plyr' was built under R version 3.1.2
# Number of rows (time indices) recorded for each instance id.
tt <- ddply(.data = data, .variables = .(id), .fun = nrow)
str(tt)
## 'data.frame':    129 obs. of  2 variables:
##  $ id: chr  "12915" "12916" "12917" "12918" ...
##  $ V1: int  103 106 102 93 98 102 106 104 99 100 ...
hist(tt$V1)  # most observation have about 100 data points

head( tt[order(tt$V1, decreasing=T), ], 20 )# MTS instance id=2901 has the longest time series points with 112 
##       id  V1
## 22  2901 112
## 17 13319 108
## 56  3101 108
## 23  2902 107
## 27  2906 107
## 2  12916 106
## 7  12938 106
## 24  2903 106
## 28  2907 106
## 29  2908 106
## 58  3103 106
## 26  2905 105
## 30  2909 105
## 57  3102 105
## 8  12939 104
## 25  2904 104
## 31  2910 104
## 32  2911 104
## 33  2912 104
## 34  2913 104
head( tt[order(tt$V1, decreasing=F), ], 20 ) # instance id=3125 has 3 points. id=13122 has 56. get rid of them
##        id V1
## 77   3125  3
## 12  13122 56
## 14  13142 92
## 4   12918 93
## 11  13121 95
## 20  13340 95
## 114  3325 95
## 115  3326 95
## 116  3327 95
## 118  3329 95
## 121  3332 95
## 125  3336 95
## 128  3342 95
## 119  3330 96
## 21  13341 97
## 54   2942 97
## 129  3343 97
## 5   12936 98
## 53   2941 98
## 80   3128 98
# Drop the two instances that are too short (id 3125 has 3 rows, id 13122 has
# 56). `id` is a character column, so match against strings with %in% rather
# than relying on implicit number-to-string coercion; %in% also never yields NA,
# so no all-NA rows can slip into the result.
ds <- data[!(data$id %in% c("3125", "13122")), ]   # ds is the new filtered data set
str(ds)  # 12829 - 3 - 56 = 12770 , OK
## 'data.frame':    12770 obs. of  23 variables:
##  $ V1   : num  9.18 10.2 11.22 12.24 13.26 ...
##  $ V2   : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ V3   : num  751 751 751 751 751 752 751 752 751 751 ...
##  $ V4   : num  753 753 755 752 755 753 753 753 752 754 ...
##  $ V5   : num  133 134 134 132 132 135 130 132 132 132 ...
##  $ V6   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V7   : int  864 833 828 800 780 1378 2323 2320 2021 2089 ...
##  $ V8   : num  101 100 99 101 100 102 100 100 99 102 ...
##  $ V9   : num  1225 1223 1222 1212 1188 ...
##  $ V10  : int  9467 9425 9475 9493 9428 9417 9407 9409 9478 9425 ...
##  $ V11  : int  8961 8927 8881 8879 8981 8893 8961 8969 8999 8992 ...
##  $ V12  : int  -147 -1392 -1599 57 104 -832 -1393 -1732 1407 1036 ...
##  $ V13  : int  25 25 26 25 25 27 24 23 22 24 ...
##  $ V14  : int  16678 16454 16376 16342 16467 15829 16312 16567 16896 16637 ...
##  $ V15  : int  19590 19592 19592 19644 19622 19502 19200 19188 19254 19054 ...
##  $ V16  : int  -651 -23 -35 -762 159 2752 -30 260 -263 -10 ...
##  $ V17  : int  16298 16442 16612 16752 16888 15262 16132 16396 16608 16754 ...
##  $ V18  : num  351 343 355 353 353 ...
##  $ V19  : int  0 0 0 0 0 3 0 0 0 0 ...
##  $ V20  : int  27758 27632 27622 27528 27550 28142 27864 28072 28140 28068 ...
##  $ V21  : int  49 49 49 49 50 49 48 48 48 49 ...
##  $ id   : chr  "12915" "12915" "12915" "12915" ...
##  $ label: chr  "bad" "bad" "bad" "bad" ...
# The first 21 columns are the time stamp followed by the sensor readings.
# Replace the default V1..V21 names with friendlier ones; the trailing
# id/label columns keep their names.
colnames(ds) <- c(
  "Time",
  "liam", "noah", "ethan", "henry", "aria",
  "emma", "levi", "finn", "asher", "violet",
  "ava", "aurora", "lily", "james", "jack",
  "teddy", "logan", "grace", "isla", "lucy",
  "id", "label"
)

Let’s do some checking and plotting

unique(ds$id)   # since two instances are out (id=3125, id=13122), we now have 129-2=127 instances
##   [1] "12915" "12916" "12917" "12918" "12936" "12937" "12938" "12939"
##   [9] "12940" "13120" "13121" "13141" "13142" "13143" "13318" "13319"
##  [17] "13320" "13339" "13340" "13341" "2901"  "2902"  "2903"  "2904" 
##  [25] "2905"  "2906"  "2907"  "2908"  "2909"  "2910"  "2911"  "2912" 
##  [33] "2913"  "2914"  "2919"  "2920"  "2921"  "2922"  "2923"  "2924" 
##  [41] "2925"  "2926"  "2927"  "2928"  "2929"  "2930"  "2931"  "2932" 
##  [49] "2933"  "2934"  "2935"  "2941"  "2942"  "2943"  "3101"  "3102" 
##  [57] "3103"  "3104"  "3105"  "3106"  "3107"  "3108"  "3109"  "3110" 
##  [65] "3111"  "3112"  "3113"  "3114"  "3115"  "3116"  "3117"  "3118" 
##  [73] "3119"  "3123"  "3124"  "3126"  "3127"  "3128"  "3129"  "3130" 
##  [81] "3131"  "3132"  "3133"  "3134"  "3135"  "3136"  "3137"  "3138" 
##  [89] "3139"  "3140"  "3301"  "3302"  "3303"  "3304"  "3305"  "3306" 
##  [97] "3307"  "3308"  "3309"  "3310"  "3311"  "3312"  "3313"  "3314" 
## [105] "3315"  "3316"  "3317"  "3321"  "3322"  "3323"  "3324"  "3325" 
## [113] "3326"  "3327"  "3328"  "3329"  "3330"  "3331"  "3332"  "3333" 
## [121] "3334"  "3335"  "3336"  "3337"  "3338"  "3342"  "3343"
ds_2901 = ds[ds$id=="2901",]   # a good sample
str(ds_2901)
## 'data.frame':    112 obs. of  23 variables:
##  $ Time  : num  11.9 13 14 15.1 16.1 ...
##  $ liam  : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ noah  : num  751 751 751 751 751 751 751 751 751 751 ...
##  $ ethan : num  753 753 755 753 754 753 753 753 753 754 ...
##  $ henry : num  132 134 134 133 132 134 132 134 135 135 ...
##  $ aria  : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ emma  : int  626 620 599 586 587 1284 1627 1624 1457 1338 ...
##  $ levi  : num  100 99 102 100 102 101 100 101 100 102 ...
##  $ finn  : num  1227 1229 1221 1201 1182 ...
##  $ asher : int  9408 9431 9389 9445 9456 9406 9405 9421 9384 9410 ...
##  $ violet: int  9019 9029 9114 9031 9043 9086 9091 9066 9144 9009 ...
##  $ ava   : int  -362 -1455 -1056 -587 -124 -417 -1661 -1189 1343 -1016 ...
##  $ aurora: int  26 26 25 25 25 25 26 25 24 25 ...
##  $ lily  : int  16599 16568 16442 16960 16564 16437 16352 16341 16642 16257 ...
##  $ james : int  20028 20042 20146 20148 20226 19478 19544 19590 19416 19614 ...
##  $ jack  : int  -296 -676 -291 -262 -547 1738 -13 452 -455 1158 ...
##  $ teddy : int  16848 16796 16512 17020 16440 17366 16130 16428 16964 16350 ...
##  $ logan : num  360 350 344 352 346 350 346 354 356 354 ...
##  $ grace : int  0 0 0 0 0 1 1 0 0 0 ...
##  $ isla  : int  27594 27440 27276 27330 27262 28178 27560 27974 27906 27750 ...
##  $ lucy  : int  49 49 49 50 50 49 48 48 48 49 ...
##  $ id    : chr  "2901" "2901" "2901" "2901" ...
##  $ label : chr  "good" "good" "good" "good" ...
plot.ts(ds_2901[1:10])

ds_12915 = ds[ds$id=="12915",]   # a bad sample
str(ds_12915)
## 'data.frame':    103 obs. of  23 variables:
##  $ Time  : num  9.18 10.2 11.22 12.24 13.26 ...
##  $ liam  : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ noah  : num  751 751 751 751 751 752 751 752 751 751 ...
##  $ ethan : num  753 753 755 752 755 753 753 753 752 754 ...
##  $ henry : num  133 134 134 132 132 135 130 132 132 132 ...
##  $ aria  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ emma  : int  864 833 828 800 780 1378 2323 2320 2021 2089 ...
##  $ levi  : num  101 100 99 101 100 102 100 100 99 102 ...
##  $ finn  : num  1225 1223 1222 1212 1188 ...
##  $ asher : int  9467 9425 9475 9493 9428 9417 9407 9409 9478 9425 ...
##  $ violet: int  8961 8927 8881 8879 8981 8893 8961 8969 8999 8992 ...
##  $ ava   : int  -147 -1392 -1599 57 104 -832 -1393 -1732 1407 1036 ...
##  $ aurora: int  25 25 26 25 25 27 24 23 22 24 ...
##  $ lily  : int  16678 16454 16376 16342 16467 15829 16312 16567 16896 16637 ...
##  $ james : int  19590 19592 19592 19644 19622 19502 19200 19188 19254 19054 ...
##  $ jack  : int  -651 -23 -35 -762 159 2752 -30 260 -263 -10 ...
##  $ teddy : int  16298 16442 16612 16752 16888 15262 16132 16396 16608 16754 ...
##  $ logan : num  351 343 355 353 353 ...
##  $ grace : int  0 0 0 0 0 3 0 0 0 0 ...
##  $ isla  : int  27758 27632 27622 27528 27550 28142 27864 28072 28140 28068 ...
##  $ lucy  : int  49 49 49 49 50 49 48 48 48 49 ...
##  $ id    : chr  "12915" "12915" "12915" "12915" ...
##  $ label : chr  "bad" "bad" "bad" "bad" ...
plot.ts(ds_12915[1:10])  # Hard to differentiate between good and bad visually, but maybe not you.

# Largest Time value of each instance: ranges from 103.621 to 127.260.
range( tapply(ds$Time, ds$id, max) )
## [1] 103.621 127.260
# Distribution of the per-instance largest Time values (the author notes it is
# centered around about 111).
hist(tapply(ds$Time, ds$id, max))

# Smallest Time value of each instance: ranges from 8.668 to 11.730.
range(tapply(ds$Time, ds$id, min) )
## [1]  8.668 11.730
hist(tapply(ds$Time, ds$id, min) )

# Time span (max - min) covered by each instance: 93.90225 to 116.88100.
range(tapply(ds$Time, ds$id, FUN=function(x) {max(x)-min(x)}))
## [1]  93.90225 116.88100
# Keep only the rows whose Time lies strictly inside (lower, upper).
#
# The defaults 11.73 and 103.621 are the latest per-instance start time and the
# earliest per-instance end time found above, i.e. the window that every
# instance covers.
# Sanity check: nrow(ds_12915[ds_12915$Time > 11.73 & ds_12915$Time < 103.621, ]) is 90
#
# Args:
#   x     : data frame with a numeric `Time` column
#   lower : exclusive lower bound on Time (default 11.73)
#   upper : exclusive upper bound on Time (default 103.621)
# Returns: the matching rows of `x`, always as a data frame. Rows whose Time is
#   NA are dropped instead of being returned as all-NA rows.
filterr <- function(x, lower = 11.73, upper = 103.621) {
  inside <- x$Time > lower & x$Time < upper
  # which() discards NA comparisons; drop = FALSE keeps one-column input a frame
  x[which(inside), , drop = FALSE]
}
# Apply the common time window to every instance separately.
# nrow(dd[dd$id=="12915", ]) is 90, OK
dd <- ddply(.data = ds, .variables = .(id), .fun = filterr)
str(dd)          # 11398 obs.
## 'data.frame':    11398 obs. of  23 variables:
##  $ Time  : num  12.2 13.3 14.3 15.3 16.3 ...
##  $ liam  : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ noah  : num  751 751 752 751 752 751 751 751 751 752 ...
##  $ ethan : num  752 755 753 753 753 752 754 753 754 753 ...
##  $ henry : num  132 132 135 130 132 132 132 133 132 132 ...
##  $ aria  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ emma  : int  800 780 1378 2323 2320 2021 2089 2113 2158 2160 ...
##  $ levi  : num  101 100 102 100 100 99 102 101 102 101 ...
##  $ finn  : num  1212 1188 1150 1164 1203 ...
##  $ asher : int  9493 9428 9417 9407 9409 9478 9425 9471 9422 9491 ...
##  $ violet: int  8879 8981 8893 8961 8969 8999 8992 8908 8930 8932 ...
##  $ ava   : int  57 104 -832 -1393 -1732 1407 1036 -1200 -983 -1482 ...
##  $ aurora: int  25 25 27 24 23 22 24 24 24 24 ...
##  $ lily  : int  16342 16467 15829 16312 16567 16896 16637 16706 16643 16290 ...
##  $ james : int  19644 19622 19502 19200 19188 19254 19054 19170 19034 19212 ...
##  $ jack  : int  -762 159 2752 -30 260 -263 -10 11 -297 56 ...
##  $ teddy : int  16752 16888 15262 16132 16396 16608 16754 16170 16706 16564 ...
##  $ logan : num  353 353 346 351 355 ...
##  $ grace : int  0 0 3 0 0 0 0 1 0 0 ...
##  $ isla  : int  27528 27550 28142 27864 28072 28140 28068 28060 28098 28030 ...
##  $ lucy  : int  49 50 49 48 48 48 49 48 49 49 ...
##  $ id    : chr  "12915" "12915" "12915" "12915" ...
##  $ label : chr  "bad" "bad" "bad" "bad" ...
# Each instance keeps between 87 and 91 rows inside the 11.73~103.621 Time
# window, i.e. roughly one point per unit of Time (presumably one per second).
range(ddply(dd, .(id), nrow)[,2])
## [1] 87 91

# Split the windowed data by class label.
dd_bad <- dd[dd$label == "bad", ]
dd_good <- dd[dd$label == "good", ]

# Take one good instance (id 2901) and look at how its numeric columns relate.
dd_good_2901 <- dd_good[dd_good$id == "2901", ]
pairs(dd_good_2901[setdiff(names(dd_good_2901), c("id", "label"))])  # can't see very well

correlationEllipses(
  cor(dd_good_2901[setdiff(names(dd_good_2901), c("id", "label"))],
      use = "pairwise.complete.obs")
)
## Loading required package: ellipse

plot.ts(dd_good_2901[1:10])

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2

# Overlay the `finn` readings of all good and all bad instances on one time
# axis, colored by class.  Now we are going somewhere!!!
ggplot() +
  geom_line(aes(x = Time, y = finn, colour = "good"), data = dd_good, size = 0.5) +
  geom_line(aes(x = Time, y = finn, colour = "bad"), data = dd_bad, size = 0.5) +
  scale_colour_discrete(
    name = "색깔",
    breaks = c("good", "bad"),
    labels = c("GOOD", "BAD")
  )

### Let's do loopy plots
# One figure per sensor (columns 2 to 21), good and bad instances overlaid.

for (sensor in names(dd_good)[2:21]) {
  sensor_plot <- ggplot() +
    geom_line(aes(x = dd_good$Time, y = dd_good[[sensor]], colour = "good"),
              size = 0.5) +
    geom_line(aes(x = dd_bad$Time, y = dd_bad[[sensor]], colour = "bad"),
              size = 0.5) +
    xlab("time") +
    ylab(sensor) +
    scale_colour_discrete(name = "Colors",
                          breaks = c("good", "bad"),
                          labels = c("GOOD", "BAD"))
  # ggplot objects are not auto-printed inside a loop
  print(sensor_plot)
}

util.R :

# Utility Library

# usage : correlationEllipses(cor(pwcl.df, use="pairwise.complete.obs"))

# Draw a correlation matrix as colored ellipses using plotcorr() from the
# 'ellipse' package.
#
# Args:
#   cor : square correlation matrix (values in [-1, 1]), e.g. from stats::cor().
#         Rows and columns are reordered by increasing correlation with the
#         first variable before plotting.
# Returns: the value of plotcorr(); the function is called for its plot.
# Stops with an error when the 'ellipse' package cannot be loaded.
correlationEllipses <- function(cor) {
  # require() only returns FALSE on failure, so turn that into a clear error
  if (!require(ellipse)) {
    stop("correlationEllipses() needs the 'ellipse' package", call. = FALSE)
  }

  # Nine RGB anchors running from red to blue ...
  anchors_rgb <- rbind(
    c(178, 24, 43),
    c(214, 96, 77),
    c(244, 165, 130),
    c(253, 219, 199),
    c(247, 247, 247),
    c(209, 229, 240),
    c(146, 197, 222),
    c(67, 147, 195),
    c(33, 102, 172)
  )
  anchors <- rgb(anchors_rgb[, 1], anchors_rgb[, 2], anchors_rgb[, 3],
                 maxColorValue = 255)
  # ... reversed, so color 1 is the blue end and color 100 the red end
  CustomPalette <- colorRampPalette(rev(anchors))
  colors <- CustomPalette(100)

  ord <- order(cor[1, ])
  # drop = FALSE keeps a 1 x 1 input a matrix
  xc <- cor[ord, ord, drop = FALSE]

  # xc * 50 + 50 lies in [0, 100]. Used directly as an index, values below 1
  # (correlations under -0.98) truncate to 0, which R silently drops, leaving
  # the color vector short and misaligned. Clamp into the valid range 1..100.
  color_index <- pmin(100L, pmax(1L, as.integer(xc * 50 + 50)))

  plotcorr(xc, col = colors[color_index],
           cex.lab = 0.7 * par("cex.lab"), cex = 0.5 * par("cex"))
}


main