-
Notifications
You must be signed in to change notification settings - Fork 1
/
MainScript.R
151 lines (124 loc) · 4.79 KB
/
MainScript.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
library(XML)
library(tm)
library(jsonlite)
library(slam)
library(igraph)
library(stringr)
library(cluster)
library(cmaes)
library(snakecase)
library(stopwords)
library(NMOF)
library(xtable)
library(Matrix)
library(devtools)
library(SpeedReader)
# Root directory of the project checkout; edit this to match your machine.
main_path <- "/home/SSBSE-LDA"
# Relative paths used later in the script (e.g. "datasets/...") are resolved
# against this directory.
setwd(main_path)

## 1. Load the shared helper scripts
for (helper_script in c("/utilities.R", "/metaheuristics.R")) {
  source(paste0(main_path, helper_script))
}
## 2. Ask which dataset to analyze and derive its paths
print("Please select the project to analyze. The available projects are: COLLECTIONS, DATACMNS, HIVE, IO, LANG, MATH, ROO, SEC, SPR, WFLY.")
# Trim stray whitespace so an accidental leading/trailing space does not end up
# inside the dataset path.
dataset_name <- trimws(readline(prompt = "Enter system name: "))
# NOTE: `system` masks base::system() for the rest of the script; the name is
# kept because later steps of this script read it.
system <- paste0("datasets/", dataset_name)
input_path <- paste0(system, "/bugrepo/repository/")
output_path <- paste0(system, "/bugrepo/splitted_repository/")

# Fail early with a clear message instead of letting a typo (or an empty answer,
# which is what readline() returns in a non-interactive session) surface later
# as an obscure error inside the helpers.
if (!nzchar(dataset_name) || !dir.exists(system)) {
  stop("Unknown project '", dataset_name, "': directory '", system,
       "' does not exist under '", getwd(), "'.", call. = FALSE)
}

## 3. Read the oracle: duplicates.json is converted by oracle2graph()
duplicate_graph <- oracle2graph(paste0(system, "/bugrepo/duplicates.json"))
#plot(duplicate_graph)

## 4. Set the path to the output file and list the input files
file_output <- "Results/results.csv"
files <- list.files(input_path)

## 5. Split the main file (containing all reports) into separate files
##    (one for each report)
# 5.1 Create the folder where the separated reports are saved, unless it is
#     already there. (The existence check used to be a bare call whose result
#     was discarded.)
if (!file.exists(output_path)) {
  dir.create(output_path, showWarnings = FALSE)
}
# 5.2 Split reports in sub-files
split_document(files, output_path)
## 6.1 Choose the fitness function
# cat() is used instead of print() so that the line break in the message is
# rendered; print() would show it as an escaped "\n" inside a quoted string.
cat("Please select the fitness function to use. The available fitness functions are:\n",
    "silhouette, coherence, raw_score\n", sep = "")
fitness_name <- trimws(readline(prompt = "Enter fitness function name: "))
# readline() returns "" in non-interactive sessions or when the user just
# presses Enter; stop here rather than running the search with no fitness name.
if (!nzchar(fitness_name)) {
  stop("No fitness function name was entered.", call. = FALSE)
}

## 6.2 Choose the R package to use
lib <- "topicmodels"
# Detach the package that is NOT selected, then load the matching
# pre-processing script.
if (lib == "textmineR") {
  detach_package(topicmodels)
  source(paste0(main_path, "/textmineR/preprocessing.R"))
} else if (lib == "topicmodels") {
  detach_package(textmineR)
  source(paste0(main_path, "/topicmodels/preprocessing.R"))
} else {
  # Without this branch an unsupported value would silently skip loading any
  # pre-processing script.
  stop("Unsupported LDA package '", lib,
       "': expected \"textmineR\" or \"topicmodels\".", call. = FALSE)
}

## 6.3 Load the remaining scripts
source(paste0(main_path, "/fitness_functions.R"))
source(paste0(main_path, "/lda_utilities.R"))
## 7. Create the document-by-term matrix
tdm <- pre_processing(output_path)

# The number of documents is read differently depending on the object that the
# selected package's pre_processing() returns.
if (lib == "textmineR") {
  n_documents <- length(dimnames(tdm)[[1]])
} else if (lib == "topicmodels") {
  n_documents <- length(tdm$dimnames$Docs)
  sparse_tdm <- convert2SparseMatrix(tdm)
}

# Number of random seeds drawn at the start of every run. It is set for every
# package (not only "topicmodels") because the run loop reads it
# unconditionally; leaving it inside the branch above made the "textmineR"
# configuration fail with "object 'n.starts' not found".
if (fitness_name == "raw_score") {
  n.starts <- 1
} else {
  n.starts <- 3
}
# 8. Meta-heuristic setting
pop_size <- 10
n_iterations <- 5
# Bounds of the search space; both vectors list the dimensions in the same order.
lower_bounds <- c(10.0,      # n. topics
                  100.0,     # n. iterations
                  0.0000001, # alpha
                  0.0000001  # beta
)
upper_bounds <- c(n_documents, # n. topics
                  200,         # n. iterations
                  1.0,         # alpha
                  1.0          # beta
)
# number of independent runs
numberOfRuns <- 100

# 9. Select metaheuristic
# cat() renders the line breaks; print() would display them as literal "\n"
# escapes inside a quoted string.
cat("Please select the meta-heuristic to use.\n",
    "The available meta-heuristics are: CMAES, DE, GA, SA, PSO, RANDOM, LS\n",
    sep = "")
# The prompt previously asked for a "fitness function name" (copy-paste slip)
# although the answer selects the meta-heuristic.
metaheuristic_name <- trimws(readline(prompt = "Enter meta-heuristic name: "))
# readline() returns "" in non-interactive sessions or on a bare Enter.
if (!nzchar(metaheuristic_name)) {
  stop("No meta-heuristic name was entered.", call. = FALSE)
}
# 10. Independent runs: search for a configuration, evaluate it with the TOP-k
#     metric and append one result row per run.
for (i in 1:numberOfRuns) {
  print(paste("Run N.", i))

  # --- search phase (timed) ---
  start.time <- proc.time()
  # Fresh seeds for this run. Both global names are assigned, as before, in
  # case the sourced helpers look either of them up.
  random_seeds <- sample(1:1000000, n.starts)
  r.seeds <- random_seeds
  # Disabled alternatives kept for reference:
  #print(paste("Random seed", random_seed))
  #set.seed(random_seed) # setting the random seed for the search
  #metaheuristic_name = "SA" # available metaheurstics: GA, DE, RANDOM, SA, PSO
  x <- run_metaheurisic(
    metaheuristic_name, lower_bounds, upper_bounds, n_iterations, pop_size
  )
  end.time <- proc.time()
  # proc.time() differences are (user, system, elapsed); the third entry is the
  # elapsed time of the search.
  elapsed_seconds <- as.numeric(end.time - start.time)[3]

  # If the search returned several rows (equally good solutions), keep the first.
  n_solutions <- nrow(x)
  if (!is.null(n_solutions) && n_solutions > 1) {
    x <- x[1, ]
  }

  # --- evaluation phase ---
  # Compute LDA with the selected configuration
  #set.seed(random_seed) # setting the same random seed used for the search
  distances <- evaluate_LDA(x, lib, tdm)

  # Evaluate the LDA configuration using the TOP-k metric
  top5 <- topk(duplicate_graph, distances, 5)
  top10 <- topk(duplicate_graph, distances, 10)
  top15 <- topk(duplicate_graph, distances, 15)
  top20 <- topk(duplicate_graph, distances, 20)

  print(paste("Top 5 = ", top5))
  print(paste("Top 10 = ", top10))
  print(paste("Top 15 = ", top15))
  print(paste("Top 20 = ", top20))

  save_results(
    system, lib, metaheuristic_name, fitness_name, file_output,
    x, top5, top10, top15, top20, elapsed_seconds
  )

  # Drop the per-run results before the next iteration.
  rm(distances, top5, top10, top15, top20)
}
###
#library(ggplot2)
#tmp <- read.csv(file_output)
#ggplot(tmp, aes(x = tmp$Algorithm, y = tmp$TOP.5, fill = tmp$FitnessFunction)) +
# geom_boxplot() +
# facet_wrap(~tmp$System, scales="free", ncol = 2) +
# ylab("TOP.5") + xlab("Meta-heuristics") +
# labs(fill = "Fitness Functions") + theme(legend.position = c(1, 0), legend.justification = c(1, 0))