-
Notifications
You must be signed in to change notification settings - Fork 1
/
fitness_functions.R
117 lines (105 loc) · 5.02 KB
/
fitness_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
library(philentropy)
source(paste(main_path, "/lda_utilities.R", sep=""))
# This function implements the Silhouette Coefficient (fitness function to maximize).
# The Silhouette coefficient takes values in [-1; 1]. Higher Silhouette Coefficient
# denotes better clustering quality
fitness_max <- function(x=c()){
n = length(x)
for (index in 1:n){
if (x[index] < lower_bounds[index] || x[index] > upper_bounds[index])
return(-Inf)
}
if (fitness_name == "silhouette"){
lda_model = train_lda(x, lib, tdm)
document2topic = document_by_topic_mixture(lda_model, lib)
# compute the distance between documents in the topics space
distances <- as.matrix(dist(document2topic, method = "euclidean", diag = T, upper = T))
coefficient <- silhouette_coefficient(document2topic, distances)
return(coefficient)
} else if (fitness_name == "coherence"){
lda_model = train_lda(x, lib, tdm)
top_words = get_top_words(lda_model, lib, 10)
numero_topic = round(x[1])
coherence <- matrix(nrow = numero_topic, ncol = 1)
for (i in 1:numero_topic){
coherence[i,1] = SpeedReader::topic_coherence(top_words[,i], sparse_tdm, vocabulary = colnames(sparse_tdm), numeric_top_words = FALSE, K=10)
}
return(mean(coherence, na.rm=T))
} else if (fitness_name == "raw_score") {
n_topics = x[1]
n_runs = 4
n = 7 # number of overlap words
runs <- vector(mode = "list", length = n_runs)
count <- 1
x2 <- x
for (i in 1:n_runs){
tdm <- shuffle_tdm(tdm)
for (j in 1:n_runs){
lda_model = train_lda(x2, lib, tdm)
top_words = get_top_words(lda_model, lib, 10)
runs[[count]] <- list(top_words)
count = count + 1
remove(lda_model)
}
.jcall("java/lang/System", "V", "gc") # call the garbage collector
gc()
}
scores <- matrix(data=0, nrow = 1, ncol = n_topics)
for (topic_index in 1:n_topics){
run1 <- runs[[1]][[1]]
for (run_index in 2:(n_runs*2)){
run2 <- runs[[run_index]][[1]]
count <- overlap_count(run1[,topic_index], run2)
if (max(count) >= n){
scores[topic_index] = scores[topic_index]+1
}
}
}
median(scores/n_runs)
return(median(scores/n_runs))
} else {
stop(paste("Fitness function", fitness_name, "not found!"), call. = TRUE)
}
}
overlap_count <- function(topic, topic_matrix){
n_topics <- ncol(topic_matrix)
count <- matrix(0, 1, n_topics)
for (index in 1:n_topics){
intersection = intersect(topic, topic_matrix[,index])
count[index] = length(intersection)
}
return(count)
}
test_fitness <- function(){
runs <- vector(mode = "list", length = 4)
for (i in 1:4)
runs[[i]] <- list(matrix("",10,4))
runs[[1]][[1]][,1]<-c("glori", "telemetri", "command", "spacecraft", "trace", "smrd", "tim", "spec", "pip", "parent")
runs[[1]][[1]][,2]<-c("spec", "smrd", "parent", "child", "glori", "artifact", "referenc", "verif", "matrix", "data")
runs[[1]][[1]][,3]<-c("test", "case", "accuraci", "roll", "document", "glori", "plan", "pitch", "yaw", "valu")
runs[[1]][[1]][,4]<-c("command", "specifi", "ground", "softwar", "telemetri", "initi", "pip", "data", "configur", "band")
runs[[2]][[1]][,1]<-c("command", "specifi", "ground", "softwar", "initi", "telemetri", "data", "pip", "configur", "band")
runs[[2]][[1]][,2]<-c("document", "test", "glori", "initi", "plan", "roll", "case", "pitch", "yaw", "point")
runs[[2]][[1]][,3]<-c("spec", "smrd", "parent", "child", "glori", "artifact", "verif", "points", "matrix", "spacecraft")
runs[[2]][[1]][,4]<-c("pip", "capabi", "glori", "ground", "command", "smrd", "spec", "tim", "list", "spacecraft")
runs[[3]][[1]][,1]<-c("spec", "smrd", "parent", "child", "glori", "referenc", "artifact", "verif", "matrix", "spacecraft")
runs[[3]][[1]][,2]<-c("command", "telemetri", "pip", "ground", "spacecraft", "mode", "software", "document", "spec", "glori")
runs[[3]][[1]][,3]<-c("command", "specifi", "ground", "softwar", "initi", "telemetri", "data", "configur", "pip", "band")
runs[[3]][[1]][,4]<-c("test", "initi", "accuraci", "glori", "roll", "plan", "matrix", "yaw", "pitch", "valu")
runs[[4]][[1]][,1]<-c("command", "specifi", "tim", "pip", "ground", "telemetri", "glori", "includ", "softwar", "document")
runs[[4]][[1]][,2]<-c("test", "initi", "glori", "plan", "roll", "accuraci", "case", "pip", "point", "yaw")
runs[[4]][[1]][,3]<-c("command", "specifi", "ground", "softwar", "telemetri", "initi", "data", "pip", "configur", "band")
runs[[4]][[1]][,4]<-c("spec", "smrd", "child", "glori", "artifact", "referenc", "parent", "verif", "matrix", "data")
}
jaccard_similarity <- function(v1, v2){
U = union(v1, v2)
I = intersect(v1, v2)
return(length(I)/length(U))
}
# This function is a wrapper around the silhouette coefficient. Minimizing this function corresponds
# to maximizing the original silhouette coefficient. Meta-heuristics that minimize the fitness
# function should use this function.
fitness_min <- function(x=c()){
value <- (-fitness_max(x))
return(value)
}