Update CUSUM_LoMST.R

48cb9b00 · Effi Latiffianti · 17ea48f6 · 48cb9b00
Commit 48cb9b00 authored 2 years ago by Effi Latiffianti
--- a/CUSUM_LoMST.R
+++ b/CUSUM_LoMST.R
-##-------------------------------------------------------------------------------##
-## This code produce cumulative score for each turbine in .csv files             ##
-## Running the whole lines of code requires about 4 hours (16 GB memory laptop)  ##
-##-------------------------------------------------------------------------------##
+##--------------------------------------------------------------------------##
+##            READ ME BEFORE RUNNING THE CODE.                              ##
+##--------------------------------------------------------------------------##
+# This code is set for gearbox. To obtain all detection, this code should be run
+# for each of the component by changing the col in line 75 with the desired component
+# column. Then changed the parameter offset to 0.3 for all components, and 0.35 for
+# hydraulic, 0.2 for transformer. Then change the file name in line 139. 
+# When plotting, the threshold for alarms are:
+# Gbx = 20, Gen = 13.5, Gen.bearing = 8, Hyd.group = 13.5, Transformer = 5
+##--------------------------------------------------------------------------##

-setwd("~/01 - WIND Research/2022 Energies Paper")
+setwd("~/01 - WIND Research/EDP Challenge")
 library(fossil)
 library(dbscan)
 library(dplyr)
@@ -17,22 +23,23 @@ source('LoMSTEDP.R') ##LoMST function (must be copied in the directory)
 #----------------------------------------------------------------#
 signals.train = read.csv("Data/wind-farm-1-signals-training.csv",header = T, as.is = T, sep = ";")
 signals.test = read.csv("Data/wind-farm-1-signals-testing.csv",header = T, as.is = T, sep = ";")
-signals = rbind.data.frame(signals.train,signals.test)
 fail = read.csv("Data/wind-farm-1-failures-training.csv",header = T, as.is = T, sep = ";")
-fail = fail[fail$Turbine_ID=="T07",]
+## add failure in the test set for T07
+fail[24,]=c("T07","HYDRAULIC_GROUP","2017-10-19T10:11:00+00:00","Oil leakage in Hub")
+signals = rbind.data.frame(signals.train,signals.test)

+#--------------------------------------------------------#
+## Make one-hour average
+#--------------------------------------------------------#
 # Averaging hourly data (it takes awhile, around 6 minutes)
 signals$Timestamp=substr(signals$Timestamp,1,nchar(signals$Timestamp)-12)
 All.Timestamp = signals$Timestamp
 All.Timestamp=All.Timestamp[!duplicated(All.Timestamp)]
-col.gbx = c(1,2,12:14,17,22,51) 
-signals <- signals[complete.cases(signals), ]
-signals.gbx = signals[,col.gbx]

 data.list = list()
 turbine = c("T01","T06","T07","T09","T11")
 for (turb in 1:5){
-  data.list[[turb]]=signals.gbx[signals.gbx$Turbine_ID==turbine[turb],]
+  data.list[[turb]]=signals[signals$Turbine_ID==turbine[turb],]
 }

 Hour.avg = data.frame()
@@ -43,37 +50,48 @@ for (i in 1:5){
  row=nrow(Hour.avg)
  Hour.avg[(row+1):(row+n),1]=turbine[i]
  Hour.avg[(row+1):(row+n),2]=date.time
-  for (j in 3:8){
+  for (j in 3:ncol(signals)){
    Hour.avg[(row+1):(row+n),j]=tapply(data[,j],data$Timestamp,mean)
  }
 }
+
 rm(date.time,i,j,n,row,data.list,data)
-names(Hour.avg)=names(signals.gbx)
+names(Hour.avg)=names(signals)

 #--------------------------------------------------------------------------------------------#
 ## Perform LoMST (takes about 3-4 hours depending on the computer)
 #--------------------------------------------------------------------------------------------#
-dat = Hour.avg
+
+## Selected signals for each of the failures component
+col.gbx = c(1,2,12:14,17,22,51) 
+col.hyd =c(1,2,11,14,24,38) #Only detect oil leakage in hydraulic group
+col.Gen = c(1:6,8:10,20,14,22,40,51)
+col.GenB = c(1:10,20,14,22,40,51)
+col.Trf = c(1,2,14,22,40,33:35,51)
+
+## Obtaining anomaly score (only one type of component failure at once)
+col = col.gbx #change this based on the failure type to model
+dat = Hour.Avg
 names(dat)[2]="timestamp"
-dat <- dat[,-1]
+dat <- dat[,col[-1]]
+dat <- dat[complete.cases(dat), ]
 data = dat[,-1]
 data[] <- lapply(data, function(x) as.numeric(as.character(x)))
 data=normalize(data, method = "range", range = c(0, 1), margin = 1L,on.constant = "quiet")
-data=data[complete.cases(data),]
+data <- data[complete.cases(data), ]
 dat <- dat[rownames(dat) %in% rownames(data), ]
-start.time= Sys.time()
 result=as.data.frame(LoMSTEDP (25))
-write.csv(result,"LoMST_Gbx.csv")
+
 #--------------------------------------------------------------------------------------------#
 #       Accumulating the scores for pre-defined accumulation windows
 #--------------------------------------------------------------------------------------------#
-result$Turbine_ID = Hour.avg$Turbine_ID[result$obs]
-threshold = 0.3 
+result$Turbine_ID = Hour.Avg$Turbine_ID[result$obs]
+offset = 0.3 
 turbine = c("T01","T06","T07","T09","T11")
 Cluster.result = list()
 for (turb in 1:5){
  mydata = result[result$Turbine_ID==turbine[turb],]
-  mydata= mydata[mydata$Outlier_Score>=threshold,]
+  mydata= mydata[mydata$Outlier_Score>=offset,]
  sort.data = as.data.frame(arrange(mydata,timestamp))
  sort.data$diff.hrs = 0
  for (i in 2:nrow(mydata)){
@@ -110,4 +128,39 @@ for (turb in 1:5){
  }
 }
 ##Write the Cumulative score
-write.csv(Cluster.result[[3]],"cusum_T07_Gbx.csv")
+write.csv(Cluster.result[[3]],"T07CUSUM_gbx.csv")
+## Repeat for other components
+#write.csv(Cluster.result[[3]],"T07CUSUM_gen.csv")
+#write.csv(Cluster.result[[3]],"T07CUSUM_genb.csv")
+#write.csv(Cluster.result[[3]],"T07CUSUM_hyd.csv")
+#write.csv(Cluster.result[[3]],"T07CUSUM_trf.csv")
+
+## Plotting the results (only after all 5 components were run)
+Cluster=list()
+Cluster[[1]]=read.csv("T07CUSUM_gen.csv")[,-1]
+Cluster[[2]]=read.csv("T07CUSUM_hyd.csv")[,-1]
+Cluster[[3]]=read.csv("T07CUSUM_genb.csv")[,-1]
+Cluster[[4]]=read.csv("T07CUSUM_trf.csv")[,-1]
+Cluster[[5]]=read.csv("T07CUSUM_gbx.csv")[,-1]
+border1 = "2017-07-03T00"
+border = "2017-09-01T00"
+component = fail$Component[!duplicated(fail$Component)]
+thrs = c(13.5,13.5,8,5,20)
+
+path_results = NULL
+pdf(paste0(path_results,'Result_T07.pdf'), height = 8, width =10)
+par(mfrow=c(3,2))
+for (comp in 1:5){
+  mydata = Cluster[[comp]]
+  x2 = as.Date(mydata$timestamp)
+  par(mai = c(0.3,0.7,0.5,0.2))
+  matplot(x2,mydata$Cum.Score,pch=20,col="gray30",ylab="Cumulative Score",xaxt="n",
+          xlim = c(x2[1],x2[length(x2)]),main=turbine[turb.number], xlab="")
+  axis.Date(1, at=seq(x2[1], x2[length(x2)], by="month"), format="%m-%Y")
+  points(x2,mydata$Cum.Score,type="l")
+  abline(v=as.Date(c(border,border1)), lty=2,lwd=1,col="red")
+  idx = which(fail$COMPONENT==component[comp])
+  abline(v=as.Date(fail$Timestamp[idx]),col = "deepskyblue3",lwd=2)
+  abline(h = thrs[comp], col="red")
+}
+dev.off()