20, 30대 읽던 글들을 이젠 이해하기가 어렵다. 산만한 정신에 빠른 지름길을 찾으려고만 하니.
졸업할 때 signal ensemble을 다루는 일을 직업으로 할 기회가 있었는데, 그 길을 가지 않았다. 그 길을 택했으면 지금보다 더 나은 사람이 되었을 것인가는 불분명하지만, 지금보다 훨씬 더 나은 사정에 있었을 것이다.
이제 그 때보다 능력은 훨씬 못하고, 주위에 동료는 없으나, 나 자신을 위한 것이니 아예 못하기 전 정리를 시작해 보자.
Purpose of this exercise: get to know how multivariate signals / time series (TS) can be represented.
Good/Bad component classification based on multiple multivariate time series sensor data
- Bad component data : “Bad_Karma”
- Good component data : “Good_Karma”
1. Read in data
library(stringr)
source("util.R")
# Read every file under ./Bad_Karma (headerless, comma-separated) and stack
# the rows into `data`, tagging each row with its instance id and the label
# "bad". The per-file frames are collected first and bound once, instead of
# growing `data` with rbind() on every iteration.
files_bad_components <- list.files(path = file.path(getwd(), "Bad_Karma"),
                                   full.names = TRUE)
bad_frames <- lapply(files_bad_components, function(path) {
  file.temp <- read.table(path, header = FALSE, sep = ",")
  # id = characters from the 9th-from-last to the 5th-from-last of the path
  file.temp$id <- str_sub(path, start = -9, end = -5)
  file.temp$label <- "bad"
  file.temp
})
# Seeding with an empty data frame keeps `data` a data frame even when the
# directory holds no files.
data <- do.call(rbind, c(list(data.frame()), bad_frames))
# Read every file under ./Good_Karma (headerless, comma-separated) and append
# the rows to the existing `data`, tagging each row with its instance id and
# the label "good". All frames are bound in a single rbind() call rather than
# one call per file.
files_good_components <- list.files(path = file.path(getwd(), "Good_Karma"),
                                    full.names = TRUE)
good_frames <- lapply(files_good_components, function(path) {
  file.temp <- read.table(path, header = FALSE, sep = ",")
  # id = characters from the 8th-from-last to the 5th-from-last of the path
  file.temp$id <- str_sub(path, start = -8, end = -5)
  file.temp$label <- "good"
  file.temp
})
data <- do.call(rbind, c(list(data), good_frames))
unique(data$id) # There are 129 instances of MTS type
## [1] "12915" "12916" "12917" "12918" "12936" "12937" "12938" "12939"
## [9] "12940" "13120" "13121" "13122" "13141" "13142" "13143" "13318"
## [17] "13319" "13320" "13339" "13340" "13341" "2901" "2902" "2903"
## [25] "2904" "2905" "2906" "2907" "2908" "2909" "2910" "2911"
## [33] "2912" "2913" "2914" "2919" "2920" "2921" "2922" "2923"
## [41] "2924" "2925" "2926" "2927" "2928" "2929" "2930" "2931"
## [49] "2932" "2933" "2934" "2935" "2941" "2942" "2943" "3101"
## [57] "3102" "3103" "3104" "3105" "3106" "3107" "3108" "3109"
## [65] "3110" "3111" "3112" "3113" "3114" "3115" "3116" "3117"
## [73] "3118" "3119" "3123" "3124" "3125" "3126" "3127" "3128"
## [81] "3129" "3130" "3131" "3132" "3133" "3134" "3135" "3136"
## [89] "3137" "3138" "3139" "3140" "3301" "3302" "3303" "3304"
## [97] "3305" "3306" "3307" "3308" "3309" "3310" "3311" "3312"
## [105] "3313" "3314" "3315" "3316" "3317" "3321" "3322" "3323"
## [113] "3324" "3325" "3326" "3327" "3328" "3329" "3330" "3331"
## [121] "3332" "3333" "3334" "3335" "3336" "3337" "3338" "3342"
## [129] "3343"
Data loading is done; now let’s do some cleansing.
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.2
# Count the rows (time indices) recorded for each instance id.
tt <- ddply(.data = data, .variables = .(id), .fun = nrow)
str(tt)
## 'data.frame': 129 obs. of 2 variables:
## $ id: chr "12915" "12916" "12917" "12918" ...
## $ V1: int 103 106 102 93 98 102 106 104 99 100 ...
hist(tt$V1) # most instances have about 100 data points
# The 20 longest instances; id=2901 has the most time points (112).
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
head(tt[order(tt$V1, decreasing = TRUE), ], 20)
## id V1
## 22 2901 112
## 17 13319 108
## 56 3101 108
## 23 2902 107
## 27 2906 107
## 2 12916 106
## 7 12938 106
## 24 2903 106
## 28 2907 106
## 29 2908 106
## 58 3103 106
## 26 2905 105
## 30 2909 105
## 57 3102 105
## 8 12939 104
## 25 2904 104
## 31 2910 104
## 32 2911 104
## 33 2912 104
## 34 2913 104
# The 20 shortest instances: id=3125 has only 3 points and id=13122 has 56,
# so both are removed from the data set.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
head(tt[order(tt$V1, decreasing = FALSE), ], 20)
## id V1
## 77 3125 3
## 12 13122 56
## 14 13142 92
## 4 12918 93
## 11 13121 95
## 20 13340 95
## 114 3325 95
## 115 3326 95
## 116 3327 95
## 118 3329 95
## 121 3332 95
## 125 3336 95
## 128 3342 95
## 119 3330 96
## 21 13341 97
## 54 2942 97
## 129 3343 97
## 5 12936 98
## 53 2941 98
## 80 3128 98
# Drop the two truncated instances (3 and 56 rows respectively).
# `id` is a character column, so the ids are matched as strings instead of
# relying on implicit number-to-string coercion; %in% also never yields NA,
# so a missing id cannot inject all-NA rows into the result.
short_ids <- c("3125", "13122")
ds <- data[!(data$id %in% short_ids), ] # ds is the new filtered data set
str(ds) # 12829 - 3 - 56 = 12770 , OK
## 'data.frame': 12770 obs. of 23 variables:
## $ V1 : num 9.18 10.2 11.22 12.24 13.26 ...
## $ V2 : int 4 4 4 4 4 4 4 4 4 4 ...
## $ V3 : num 751 751 751 751 751 752 751 752 751 751 ...
## $ V4 : num 753 753 755 752 755 753 753 753 752 754 ...
## $ V5 : num 133 134 134 132 132 135 130 132 132 132 ...
## $ V6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V7 : int 864 833 828 800 780 1378 2323 2320 2021 2089 ...
## $ V8 : num 101 100 99 101 100 102 100 100 99 102 ...
## $ V9 : num 1225 1223 1222 1212 1188 ...
## $ V10 : int 9467 9425 9475 9493 9428 9417 9407 9409 9478 9425 ...
## $ V11 : int 8961 8927 8881 8879 8981 8893 8961 8969 8999 8992 ...
## $ V12 : int -147 -1392 -1599 57 104 -832 -1393 -1732 1407 1036 ...
## $ V13 : int 25 25 26 25 25 27 24 23 22 24 ...
## $ V14 : int 16678 16454 16376 16342 16467 15829 16312 16567 16896 16637 ...
## $ V15 : int 19590 19592 19592 19644 19622 19502 19200 19188 19254 19054 ...
## $ V16 : int -651 -23 -35 -762 159 2752 -30 260 -263 -10 ...
## $ V17 : int 16298 16442 16612 16752 16888 15262 16132 16396 16608 16754 ...
## $ V18 : num 351 343 355 353 353 ...
## $ V19 : int 0 0 0 0 0 3 0 0 0 0 ...
## $ V20 : int 27758 27632 27622 27528 27550 28142 27864 28072 28140 28068 ...
## $ V21 : int 49 49 49 49 50 49 48 48 48 49 ...
## $ id : chr "12915" "12915" "12915" "12915" ...
## $ label: chr "bad" "bad" "bad" "bad" ...
# The columns are sensor readings; give them friendlier names.
# Order matters: the first column is the timestamp, the next twenty are the
# sensors, and the last two are the instance id and the class label.
colnames(ds) <- c(
  "Time",
  "liam", "noah", "ethan", "henry", "aria",
  "emma", "levi", "finn", "asher", "violet",
  "ava", "aurora", "lily", "james", "jack",
  "teddy", "logan", "grace", "isla", "lucy",
  "id", "label"
)
Let’s do some checking and plotting
unique(ds$id) # since two instances are out (id=3125, id=13122), we now have 129-2=127 instances
## [1] "12915" "12916" "12917" "12918" "12936" "12937" "12938" "12939"
## [9] "12940" "13120" "13121" "13141" "13142" "13143" "13318" "13319"
## [17] "13320" "13339" "13340" "13341" "2901" "2902" "2903" "2904"
## [25] "2905" "2906" "2907" "2908" "2909" "2910" "2911" "2912"
## [33] "2913" "2914" "2919" "2920" "2921" "2922" "2923" "2924"
## [41] "2925" "2926" "2927" "2928" "2929" "2930" "2931" "2932"
## [49] "2933" "2934" "2935" "2941" "2942" "2943" "3101" "3102"
## [57] "3103" "3104" "3105" "3106" "3107" "3108" "3109" "3110"
## [65] "3111" "3112" "3113" "3114" "3115" "3116" "3117" "3118"
## [73] "3119" "3123" "3124" "3126" "3127" "3128" "3129" "3130"
## [81] "3131" "3132" "3133" "3134" "3135" "3136" "3137" "3138"
## [89] "3139" "3140" "3301" "3302" "3303" "3304" "3305" "3306"
## [97] "3307" "3308" "3309" "3310" "3311" "3312" "3313" "3314"
## [105] "3315" "3316" "3317" "3321" "3322" "3323" "3324" "3325"
## [113] "3326" "3327" "3328" "3329" "3330" "3331" "3332" "3333"
## [121] "3334" "3335" "3336" "3337" "3338" "3342" "3343"
ds_2901 = ds[ds$id=="2901",] # a good sample
str(ds_2901)
## 'data.frame': 112 obs. of 23 variables:
## $ Time : num 11.9 13 14 15.1 16.1 ...
## $ liam : int 4 4 4 4 4 4 4 4 4 4 ...
## $ noah : num 751 751 751 751 751 751 751 751 751 751 ...
## $ ethan : num 753 753 755 753 754 753 753 753 753 754 ...
## $ henry : num 132 134 134 133 132 134 132 134 135 135 ...
## $ aria : int 0 0 0 0 0 0 0 0 1 0 ...
## $ emma : int 626 620 599 586 587 1284 1627 1624 1457 1338 ...
## $ levi : num 100 99 102 100 102 101 100 101 100 102 ...
## $ finn : num 1227 1229 1221 1201 1182 ...
## $ asher : int 9408 9431 9389 9445 9456 9406 9405 9421 9384 9410 ...
## $ violet: int 9019 9029 9114 9031 9043 9086 9091 9066 9144 9009 ...
## $ ava : int -362 -1455 -1056 -587 -124 -417 -1661 -1189 1343 -1016 ...
## $ aurora: int 26 26 25 25 25 25 26 25 24 25 ...
## $ lily : int 16599 16568 16442 16960 16564 16437 16352 16341 16642 16257 ...
## $ james : int 20028 20042 20146 20148 20226 19478 19544 19590 19416 19614 ...
## $ jack : int -296 -676 -291 -262 -547 1738 -13 452 -455 1158 ...
## $ teddy : int 16848 16796 16512 17020 16440 17366 16130 16428 16964 16350 ...
## $ logan : num 360 350 344 352 346 350 346 354 356 354 ...
## $ grace : int 0 0 0 0 0 1 1 0 0 0 ...
## $ isla : int 27594 27440 27276 27330 27262 28178 27560 27974 27906 27750 ...
## $ lucy : int 49 49 49 50 50 49 48 48 48 49 ...
## $ id : chr "2901" "2901" "2901" "2901" ...
## $ label : chr "good" "good" "good" "good" ...
plot.ts(ds_2901[1:10])
ds_12915 = ds[ds$id=="12915",] # a bad sample
str(ds_12915)
## 'data.frame': 103 obs. of 23 variables:
## $ Time : num 9.18 10.2 11.22 12.24 13.26 ...
## $ liam : int 4 4 4 4 4 4 4 4 4 4 ...
## $ noah : num 751 751 751 751 751 752 751 752 751 751 ...
## $ ethan : num 753 753 755 752 755 753 753 753 752 754 ...
## $ henry : num 133 134 134 132 132 135 130 132 132 132 ...
## $ aria : int 0 0 0 0 0 0 0 0 0 0 ...
## $ emma : int 864 833 828 800 780 1378 2323 2320 2021 2089 ...
## $ levi : num 101 100 99 101 100 102 100 100 99 102 ...
## $ finn : num 1225 1223 1222 1212 1188 ...
## $ asher : int 9467 9425 9475 9493 9428 9417 9407 9409 9478 9425 ...
## $ violet: int 8961 8927 8881 8879 8981 8893 8961 8969 8999 8992 ...
## $ ava : int -147 -1392 -1599 57 104 -832 -1393 -1732 1407 1036 ...
## $ aurora: int 25 25 26 25 25 27 24 23 22 24 ...
## $ lily : int 16678 16454 16376 16342 16467 15829 16312 16567 16896 16637 ...
## $ james : int 19590 19592 19592 19644 19622 19502 19200 19188 19254 19054 ...
## $ jack : int -651 -23 -35 -762 159 2752 -30 260 -263 -10 ...
## $ teddy : int 16298 16442 16612 16752 16888 15262 16132 16396 16608 16754 ...
## $ logan : num 351 343 355 353 353 ...
## $ grace : int 0 0 0 0 0 3 0 0 0 0 ...
## $ isla : int 27758 27632 27622 27528 27550 28142 27864 28072 28140 28068 ...
## $ lucy : int 49 49 49 49 50 49 48 48 48 49 ...
## $ id : chr "12915" "12915" "12915" "12915" ...
## $ label : chr "bad" "bad" "bad" "bad" ...
plot.ts(ds_12915[1:10]) # Hard to differentiate between good and bad visually, but maybe not you.
range( tapply(ds$Time, ds$id, max) ) # last timestamp per instance ranges from 103.621 to 127.260
## [1] 103.621 127.260
hist(tapply(ds$Time, ds$id, max)) # distribution of each instance's last timestamp (author's note: centered around about 111)
range(tapply(ds$Time, ds$id, min) ) # first timestamp per instance ranges from 8.668 to 11.730
## [1] 8.668 11.730
hist(tapply(ds$Time, ds$id, min) ) # distribution of each instance's first timestamp
range(tapply(ds$Time, ds$id, FUN=function(x) {max(x)-min(x)})) # covered time span per instance: 93.90225 ~ 116.88100
## [1] 93.90225 116.88100
# Keep only the rows whose Time lies strictly inside a common window.
# The defaults (11.73, 103.621) are the largest first timestamp and the
# smallest last timestamp observed across instances, i.e. the window that
# every instance covers.
# Sanity check: nrow(ds_12915[ds_12915$Time > 11.73 & ds_12915$Time < 103.621, ]) # 90
#
# x     : data frame with a numeric `Time` column
# lower : exclusive lower bound on Time
# upper : exclusive upper bound on Time
# Returns the rows of `x` with lower < Time < upper, all columns kept.
filterr <- function(x, lower = 11.73, upper = 103.621) {
  # which() discards NA comparisons, so a missing Time drops the row instead
  # of producing a row filled with NA.
  keep <- which(x$Time > lower & x$Time < upper)
  x[keep, , drop = FALSE]
}
# Apply the common time window to every instance separately and re-stack.
dd <- ddply(.data = ds, .variables = .(id), .fun = filterr) # nrow(dd[dd$id=="12915", ]) is 90, OK
str(dd) # 11398 obs.
## 'data.frame': 11398 obs. of 23 variables:
## $ Time : num 12.2 13.3 14.3 15.3 16.3 ...
## $ liam : int 4 4 4 4 4 4 4 4 4 4 ...
## $ noah : num 751 751 752 751 752 751 751 751 751 752 ...
## $ ethan : num 752 755 753 753 753 752 754 753 754 753 ...
## $ henry : num 132 132 135 130 132 132 132 133 132 132 ...
## $ aria : int 0 0 0 0 0 0 0 0 0 0 ...
## $ emma : int 800 780 1378 2323 2320 2021 2089 2113 2158 2160 ...
## $ levi : num 101 100 102 100 100 99 102 101 102 101 ...
## $ finn : num 1212 1188 1150 1164 1203 ...
## $ asher : int 9493 9428 9417 9407 9409 9478 9425 9471 9422 9491 ...
## $ violet: int 8879 8981 8893 8961 8969 8999 8992 8908 8930 8932 ...
## $ ava : int 57 104 -832 -1393 -1732 1407 1036 -1200 -983 -1482 ...
## $ aurora: int 25 25 27 24 23 22 24 24 24 24 ...
## $ lily : int 16342 16467 15829 16312 16567 16896 16637 16706 16643 16290 ...
## $ james : int 19644 19622 19502 19200 19188 19254 19054 19170 19034 19212 ...
## $ jack : int -762 159 2752 -30 260 -263 -10 11 -297 56 ...
## $ teddy : int 16752 16888 15262 16132 16396 16608 16754 16170 16706 16564 ...
## $ logan : num 353 353 346 351 355 ...
## $ grace : int 0 0 3 0 0 0 0 1 0 0 ...
## $ isla : int 27528 27550 28142 27864 28072 28140 28068 28060 28098 28030 ...
## $ lucy : int 49 50 49 48 48 48 49 48 49 49 ...
## $ id : chr "12915" "12915" "12915" "12915" ...
## $ label : chr "bad" "bad" "bad" "bad" ...
range(ddply(dd, .(id), nrow)[,2]) # each instances have between 87~91 time points within 11.73~103.6
## [1] 87 91
# time range, so that's about 1 point per second
# Split the windowed data by class label.
dd_bad <- dd[dd$label == "bad", ] # all bad instances
dd_good <- dd[dd$label == "good", ] # all good instances
# Look at one good instance (id 2901) in detail.
dd_good_2901 <- dd_good[dd_good$id == "2901", ]
# Numeric columns only: the id and label columns are dropped before plotting.
sensors_2901 <- subset(dd_good_2901, select = -c(id, label))
pairs(sensors_2901) # scatterplot matrix; too dense to read well
correlationEllipses(cor(sensors_2901, use = "pairwise.complete.obs"))
## Loading required package: ellipse
plot.ts(dd_good_2901[1:10])
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
# Overlay the `finn` sensor trace of every instance, colored by class.
# `group = id` makes geom_line() draw one line per instance; without it all
# instances of a class are joined into a single path ordered by Time, which
# draws spurious segments between unrelated series.
ggplot() +
  geom_line(data = dd_good,
            aes(x = Time, y = finn, group = id, color = "good"), size = 0.5) +
  geom_line(data = dd_bad,
            aes(x = Time, y = finn, group = id, color = "bad"), size = 0.5) +
  scale_colour_discrete(name = "색깔", breaks = c("good", "bad"),
                        labels = c("GOOD", "BAD")) # Now we are going somewhere!!!
### Let's do loopy plots
# One good-vs-bad overlay per sensor. The sensor columns are selected by name
# (everything except Time, id and label) rather than by a fixed index range.
sensor_cols <- setdiff(names(dd_good), c("Time", "id", "label"))
for (sensor in sensor_cols) {
  # Small per-sensor frames let aes() refer to plain column names instead of
  # `$` / `[, i]` expressions evaluated outside the plot data.
  good_i <- data.frame(Time = dd_good$Time, value = dd_good[[sensor]],
                       id = dd_good$id)
  bad_i <- data.frame(Time = dd_bad$Time, value = dd_bad[[sensor]],
                      id = dd_bad$id)
  # print() is required for ggplot objects created inside a loop.
  # group = id draws one line per instance instead of a single joined path.
  print(
    ggplot() +
      geom_line(data = good_i,
                aes(x = Time, y = value, group = id, color = "good"),
                size = 0.5) +
      geom_line(data = bad_i,
                aes(x = Time, y = value, group = id, color = "bad"),
                size = 0.5) +
      xlab("time") + ylab(sensor) +
      scale_colour_discrete(name = "Colors", breaks = c("good", "bad"),
                            labels = c("GOOD", "BAD"))
  )
}
util.R :
# Utility Library
# usage : correlationEllipses(cor(pwcl.df, use="pairwise.complete.obs"))
#
# Draw a correlation matrix as a grid of colored ellipses with
# ellipse::plotcorr().
#
# cor : square correlation matrix (e.g. the result of stats::cor()).
#
# Rows/columns are reordered by their correlation with the first variable.
# Colors come from a 100-step ramp running from blue (correlation -1) through
# near-white (0) to red (+1). Called for its plotting side effect.
correlationEllipses <- function(cor){
  # Fail loudly when the plotting package is missing; require() would only
  # return FALSE and let the call below fail with a less helpful message.
  if (!requireNamespace("ellipse", quietly = TRUE)) {
    stop("correlationEllipses() needs the 'ellipse' package", call. = FALSE)
  }
  # Nine anchor colors, listed red -> blue and reversed below so that the
  # ramp starts at blue.
  anchors <- rgb(
    red   = c(178, 214, 244, 253, 247, 209, 146,  67,  33),
    green = c( 24,  96, 165, 219, 247, 229, 197, 147, 102),
    blue  = c( 43,  77, 130, 199, 247, 240, 222, 195, 172),
    maxColorValue = 255
  )
  colors <- colorRampPalette(rev(anchors))(100)
  ord <- order(cor[1, ])
  xc <- cor[ord, ord]
  # Map [-1, 1] onto palette positions. Correlations below about -0.98 map to
  # position 0, which R silently drops from an index vector and thereby shifts
  # every following color; clamp those to the first palette entry.
  idx <- pmax(1L, as.integer(xc * 50 + 50))
  ellipse::plotcorr(xc, col = colors[idx],
                    cex.lab = 0.7 * par("cex.lab"), cex = 0.5 * par("cex"))
}
'Learning & Reasoning > R ' 카테고리의 다른 글
A simple time series clustering (0) | 2015.04.24 |
---|---|
Signal and time series seen from eight miles high cloud - DFT & Simple digital filtering (0) | 2015.02.18 |
Supervised Learning with R (0) | 2014.08.02 |
쿨한 machine learning (0) | 2013.08.19 |
빨리 진도 나가야 하는데... (0) | 2013.02.28 |