Skip to content

Commit

Permalink
i #223 Add GoF Module and QoL for Text Module
Browse files Browse the repository at this point in the history
The GoF Module slightly extends the previous GoF Notebook
API to facilitate integration with other analyses in Kaiaulu.

- De-coupled GoF Parser into Parser/Writer
- Sub-setting of GoF Output from Classes/Methods/Variables -> Classes
- Mapping from Classes to Filepaths
- Reorganization of functions into a dedicated GoF module.

Signed-off-by: Carlos Paradis <[email protected]>
  • Loading branch information
carlosparadis committed Sep 1, 2023
1 parent f350952 commit ad6efa7
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 102 deletions.
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ export(metric_file_non_bug_churn)
export(metric_file_non_bug_frequency)
export(model_directed_graph)
export(motif_factory)
export(motif_factory_anti_square)
export(motif_factory_anti_triangle)
export(motif_factory_square)
export(motif_factory_triangle)
export(normalized_levenshtein)
Expand Down Expand Up @@ -105,13 +107,15 @@ export(parse_r_function_dependencies)
export(parse_rfile_ast)
export(query_src_text)
export(query_src_text_class_names)
export(query_src_text_namespace)
export(read_temporary_file)
export(recolor_network_by_community)
export(smell_missing_links)
export(smell_organizational_silo)
export(smell_radio_silence)
export(smell_sociotechnical_congruence)
export(split_name_email)
export(subset_gof_class)
export(transform_commit_message_id_to_network)
export(transform_cve_cwe_file_to_network)
export(transform_dependencies_to_network)
Expand All @@ -126,6 +130,7 @@ export(transform_reply_to_bipartite_network)
export(transform_temporal_gitlog_to_adsmj)
export(weight_scheme_count_deleted_nodes)
export(weight_scheme_sum_edges)
export(write_gof_patterns)
importFrom(data.table,":=")
importFrom(data.table,.N)
importFrom(data.table,as.data.table)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ __kaiaulu 0.0.0.9700 (in development)__

### NEW FEATURES

* Adds GoF module and various utility functions to facilitate integrating identified pattern classes to files. [#223](https://github.com/sailuh/kaiaulu/issues/223)
* Adds `parse_jira_rss_xml()`, which enables reusing the full 26 projects dataset of our prior TSE work. [#218](https://github.com/sailuh/kaiaulu/issues/218)
* Adds `metric_file_bug_frequency()`, `metric_file_non_bug_frequency()`, `metric_file_bug_churn()`, `metric_file_non_bug_churn()`, `metric_file_churn()` to `R/metric.R` [#214](https://github.com/sailuh/kaiaulu/issues/214)
* Adds Gang of Four parser for Tsantalis' parser4.jar [#211](https://github.com/sailuh/kaiaulu/issues/211)
Expand Down
141 changes: 141 additions & 0 deletions R/gof.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Kaiaulu - https://github.com/sailuh/kaiaulu
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

#' Write GoF Patterns
#'
#' Runs Tsantalis' `pattern4.jar` on a folder of compiled classes and has it
#' write the detected GoF patterns to an XML file.
#' Pattern4.jar is available on
#' [Tsantalis' homepage](https://users.encs.concordia.ca/~nikolaos/pattern_detection.html).
#'
#' @param pattern4_path The path to Tsantalis' pattern4 jar
#' @param class_folder_path The path to a folder one
#' level above subdirectories that contain the class files.
#' @param output_filepath Optional path to store the XML generated by pattern4. If not
#' specified, it will be saved to `/tmp/gof.xml`.
#'
#' @return Invisibly, a character vector with the lines `java` printed to
#' standard output. The patterns themselves are written to `output_filepath`.
#' @references N. Tsantalis, A. Chatzigeorgiou, G. Stephanides, S. T. Halkidis,
#' "Design Pattern Detection Using Similarity Scoring",
#' IEEE Transactions on Software Engineering,
#' vol. 32, no. 11, pp. 896-909, November, 2006.
#' @export
write_gof_patterns <- function(pattern4_path,class_folder_path,output_filepath='/tmp/gof.xml'){

  pattern4_path <- path.expand(pattern4_path)
  class_folder_path <- path.expand(class_folder_path)
  # Expanded up front: the path is passed quoted below, and a quoted "~"
  # is not expanded by the shell.
  output_filepath <- path.expand(output_filepath)

  if(!file.exists(pattern4_path)) stop("The specified pattern4_path does not exist!")
  if(!dir.exists(class_folder_path)) stop("The specified class_folder_path does not exist!")

  # java -Xms32m -Xmx512m -jar pattern4.jar -target "C:\foo\myclasses" -output "C:\foo\output.xml"
  # Every path is quoted so that paths containing spaces reach java as a
  # single argument.
  java_args <- c('-Xms64m','-Xmx100000m','-jar',
                 shQuote(pattern4_path),
                 '-target',shQuote(class_folder_path),
                 '-output',shQuote(output_filepath))

  # stdout is captured and returned; stderr is discarded.
  java_stdout <- system2("java",
                         args = java_args,
                         stdout = TRUE,
                         stderr = FALSE)

  invisible(java_stdout)
}

#' Parse GoF Patterns
#'
#' Parses the XML file of GoF patterns generated by
#' \code{\link{write_gof_patterns}} into a table.
#' Pattern4.jar is available on
#' [Tsantalis' homepage](https://users.encs.concordia.ca/~nikolaos/pattern_detection.html).
#'
#' @param output_filepath Optional path to read the XML generated by \code{\link{write_gof_patterns}}. If not
#' specified, the file is assumed to be at `/tmp/gof.xml`.
#'
#' @return A data.table with one row per role of each detected pattern
#' instance and the columns `pattern_name`, `instance_id`, `role_name` and
#' `element`. `instance_id` restarts at 1 for every pattern. If no pattern
#' instance was detected, the table has the same columns and zero rows.
#' @references N. Tsantalis, A. Chatzigeorgiou, G. Stephanides, S. T. Halkidis,
#' "Design Pattern Detection Using Similarity Scoring",
#' IEEE Transactions on Software Engineering,
#' vol. 32, no. 11, pp. 896-909, November, 2006.
#' @export
parse_gof_patterns <- function(output_filepath='/tmp/gof.xml'){
  gof_pattern_xml <- XML::xmlTreeParse(output_filepath)

  # The <system> root node enumerates a fixed number of <pattern> tags.
  gof_root <- XML::xmlRoot(gof_pattern_xml) #class => XML Node
  patterns <- XML::xmlChildren(gof_root) #class => XMLNodeList (lapply safe)

  # Reads one attribute from every role node as a plain character vector.
  # A role lacking the attribute yields NA instead of changing the result type.
  get_role_attr <- function(roles, attr_name){
    vapply(roles,
           function(role) XML::xmlGetAttr(role, attr_name, default = NA_character_),
           character(1),
           USE.NAMES = FALSE)
  }

  # Turns one <instance> node into a table with one row per role.
  # The id is passed in explicitly instead of being kept in a `<<-` counter,
  # which would otherwise end up in the global environment.
  parse_instance <- function(instance, instance_id){
    roles <- XML::xmlChildren(instance)
    data.table(instance_id = instance_id,
               role_name = get_role_attr(roles, "name"),
               element = get_role_attr(roles, "element"))
  }

  parse_pattern <- function(pattern){
    # Each GoF pattern, if occurring on the code, is assigned an instance
    n_instances <- XML::xmlSize(pattern)

    # The XML mentions the pattern name even with no instances detected. We do not
    # include the pattern name if no instances are detected.
    if(n_instances == 0){
      return(data.table())
    }

    pattern_name <- XML::xmlGetAttr(pattern,"name")
    instances <- XML::xmlChildren(pattern)

    # Ids are 1..n within the pattern, kept as doubles like the former counter.
    instances_dt <- rbindlist(lapply(seq_along(instances), function(i){
      parse_instance(instances[[i]], as.numeric(i))
    }))
    instances_dt$pattern_name <- pattern_name
    return(instances_dt)
  }
  patterns_dt <- rbindlist(lapply(patterns,parse_pattern))

  # Without any detected instance the bound table has no columns, and the
  # column selection below would fail.
  if(nrow(patterns_dt) == 0){
    return(data.table(pattern_name = character(0),
                      instance_id = numeric(0),
                      role_name = character(0),
                      element = character(0)))
  }

  patterns_dt <- patterns_dt[,.(pattern_name,instance_id,role_name,element)]
  return(patterns_dt)
}


#' Subset GoF Classes
#'
#' The table returned by \code{\link{parse_gof_patterns}} contains not only
#' the participation of a class in a GoF Pattern, but also
#' the participation of methods and variables when applicable.
#' To distinguish a row entry among class, method or variable,
#' we must subset the role names that are associated to classes.
#' This information can be obtained by inspecting the source code
#' of a similar tool to pattern4 by Tsantalis.
#'
#' More specifically, every pattern that pattern4.jar can identify is
#' defined as a PatternDescriptor in DPD4Eclipse/src/gr/uom/java/pattern
#' /PatternGenerator.java (see: https://github.com/tsantalis/DPD4Eclipse).
#'
#' E.g. The PatternDescriptor Decorator has rowNameList.add("Component");
#' rowNameList.add("Decorator"); Therefore, in the XML output by pattern4.jar,
#' the element is a class when
#' (pattern_name == "Decorator" & role_name == "Component") or
#' (pattern_name == "Decorator" & role_name == "Decorator").
#'
#' By following this process, a list of role names can be defined to subset the table
#' to only contain classes. Note the subset is done on `role_name` alone;
#' `pattern_name` is not consulted.
#'
#' Note pattern4 executes in bytecode, hence the classes are identified by their namespace.
#' Refer to \code{\link{query_src_text_namespace}} to obtain a table to map namespace classes
#' to filepaths.
#' @param gof_patterns A table of parsed GoF Patterns
#' obtained from \code{\link{parse_gof_patterns}}.
#' @return The rows of `gof_patterns` whose `role_name` is one of the
#' role names associated to classes.
#' @export
subset_gof_class <- function(gof_patterns){
  # Role names that identify a class (as opposed to a method or variable).
  class_role_names <- c(
    'Creator', 'Abstraction', 'Adapter', 'Singleton', 'Prototype',
    'Decorator', 'AbstractClass', 'Composite', 'Subject', 'State',
    'Visitor', 'Strategy', 'Observer', 'Command', 'Handler',
    'Component', 'Context', 'Implementor', 'ConcreteElement', 'Client',
    'Proxy', 'RealSubject', 'FamilyHead', 'Redirecter'
  )
  return(gof_patterns[role_name %in% class_role_names])
}
86 changes: 0 additions & 86 deletions R/parser.R
Original file line number Diff line number Diff line change
Expand Up @@ -1537,92 +1537,6 @@ parse_r_dependencies <- function(folder_path){
return(edgelists)
}

#' Parse GoF Patterns
#'
#' Runs `pattern4.jar` on a folder of class files and parses the GoF patterns
#' it writes as XML into a table.
#' \url{https://www.srcml.org/documentation.html}.
#' Pattern4.jar is available on
#' [Tsantalis' homepage](https://users.encs.concordia.ca/~nikolaos/pattern_detection.html).
#'
#' @param pattern4_path The path to Tsantalis' pattern4 jar
#' @param class_folder_path The path to a folder one
#' level above subdirectories that contain the class files.
#' @param output_filepath Optional path to store the XML generated by pattern4. If not
#' specified, it will be saved to `/tmp/gof.xml`.
#'
#' @return A data.table with the columns `pattern_name`, `instance_id`,
#' `role_name` and `element`, one row per role of each pattern instance.
#' @references N. Tsantalis, A. Chatzigeorgiou, G. Stephanides, S. T. Halkidis,
#' "Design Pattern Detection Using Similarity Scoring",
#' IEEE Transactions on Software Engineering,
#' vol. 32, no. 11, pp. 896-909, November, 2006.
#' @export
parse_gof_patterns <- function(pattern4_path,class_folder_path,output_filepath='/tmp/gof.xml'){

# Only the jar and class folder paths are expanded; output_filepath is used as given.
pattern4_path <- path.expand(pattern4_path)
class_folder_path <- path.expand(class_folder_path)

if(!file.exists(pattern4_path)) stop("The specified pattern4_path does not exist!")
if(!dir.exists(class_folder_path)) stop("The specified class_folder_path does not exist!")

# java -Xms32m -Xmx512m -jar pattern4.jar -target "C:\foo\myclasses" -output "C:\foo\output.xml"
# stdout is captured into gof_pattern_xml_path (not used afterwards); stderr is discarded.
gof_pattern_xml_path <- system2("java",
args = c('-Xms32m','-Xmx512m','-jar',
pattern4_path,
'-target',paste0('"',class_folder_path,'"'),
'-output',paste0('"',output_filepath,'"')),
stdout = TRUE,
stderr = FALSE)



# Read back the XML file that pattern4 was asked to write.
gof_pattern_xml <- XML::xmlTreeParse(output_filepath)

# The <system> root node enumerates a fixed number of <pattern> tags.
gof_root <- XML::xmlRoot(gof_pattern_xml) #class => XML Node
patterns <- XML::xmlChildren(gof_root) #class => XMLNodeList (lapply safe)

# Builds the rows for one pattern instance: one entry per role child node.
# Reads and then increments the shared instance_id counter set by parse_pattern.
parse_instance <- function(instance){

roles <- XML::xmlChildren(instance)
role_names <- lapply(roles,XML::xmlGetAttr,"name")
element <- lapply(roles,XML::xmlGetAttr,"element")

instance <- data.table(instance_id,
role_name = role_names,
element = element)

instance_id <<- instance_id + 1

return(instance)
}

# Builds the table for one <pattern> node, or an empty table when the
# pattern has no child instances.
parse_pattern <- function(pattern){
# Each GoF pattern, if occurring on the code, is assigned an instance
n_instances <- XML::xmlSize(pattern)

# The XML mentions the pattern name even with no instances detected. We do not
# include the pattern name if no instances are detected.
if(n_instances > 0){

# Note counter bypasses lapply scope <<-
# NOTE(review): no enclosing function defines instance_id, so unless an outer
# scope already does, `<<-` creates it in the global environment - confirm.
instance_id <<- 1

pattern_name <- XML::xmlGetAttr(pattern,"name")

instances <- XML::xmlChildren(pattern)
instances_dt <- rbindlist(lapply(instances,parse_instance))
instances_dt$pattern_name <- pattern_name
return(instances_dt)
}else{
return(data.table())
}
}
patterns_dt <- rbindlist(lapply(patterns,parse_pattern))

# Fix the column order of the returned table.
patterns_dt <- patterns_dt[,.(pattern_name,instance_id,role_name,element)]
return(patterns_dt)
}

# Various imports
utils::globalVariables(c("."))
#' @importFrom magrittr %>%
Expand Down
53 changes: 53 additions & 0 deletions R/text.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,56 @@ query_src_text_class_names <- function(srcml_path,srcml_filepath){
return(dt_filepath_classname)

}

#' Query srcML Namespace
#'
#' This is a convenience function to parse namespace names out of a project.
#' For every unit returned by the srcML query `//src:package`, the package
#' name is combined with the file name (without the `.java` extension)
#' to form the namespace of the class.
#' \url{https://www.srcml.org/documentation.html}.
#'
#'
#' @param srcml_path The path to srcML binary
#' @param srcml_filepath The path to the srcML file to be queried
#' (see \code{\link{annotate_src_text}}).
#' @param project_name Name of the project's root folder. Everything in a
#' file path up to and including the first occurrence of `project_name/` is
#' removed to obtain a relative file path. It is matched as a fixed string.
#' Defaults to `'iotdb'`, the value that was previously hard-coded. Use `NULL`
#' to keep the file paths as reported by srcML.
#'
#' @return A data.table with the columns `filepath` (the file path relative to
#' the project folder) and `namespace` (package name and file name joined by a dot).
#' @references For details, see \url{https://www.srcml.org/documentation.html}.
#' @export
query_src_text_namespace <- function(srcml_path,srcml_filepath,project_name='iotdb'){
  srcml_path <- path.expand(srcml_path)
  srcml_filepath <- path.expand(srcml_filepath)

  xpath_query <- "//src:package"

  srcml_output <- query_src_text(srcml_path,xpath_query,srcml_filepath)

  srcml_output <- XML::xmlTreeParse(srcml_output)
  srcml_root <- XML::xmlRoot(srcml_output)

  # The children of the root node is a list of unit nodes
  package_units <- XML::xmlChildren(srcml_root)
  # Each unit node is of the form:
  # <unit revision="1.0.0" language="Java"
  #       filename="/path/to/git_repo/iotdb/tsfile/src/.../FakedMultiBatchReader.java"
  #       item="1"><package>package <name>...</name>;</package></unit>

  # Folder marker used to cut the absolute prefix off each file path.
  path_marker <- if(is.null(project_name)) NULL else paste0(project_name,'/')

  parse_namespace <- function(unit){
    # The package declaration is the first child node of each unit
    package_name <- trimws(XML::xmlValue(unit[[1]]))
    # Drop the `package` keyword (and any whitespace after it) and the
    # trailing semicolon, leaving only the dotted package name.
    package_name <- sub("^package[[:space:]]*", "", package_name)
    package_name <- trimws(sub(";$","",package_name))

    # The attribute filename contains the file the package declaration belongs to
    filepath <- XML::xmlGetAttr(unit,"filename",default = NA_character_)

    # Get relative path: remove everything up to the first `project_name/`
    if(!is.null(path_marker)){
      marker_start <- regexpr(path_marker, filepath, fixed = TRUE)
      if(isTRUE(marker_start > 0)){
        filepath <- substring(filepath, marker_start + nchar(path_marker))
      }
    }

    file_class_name <- sub("\\.java$", "", basename(filepath))
    namespace <- paste(package_name, file_class_name, sep=".")
    return(data.table(filepath=filepath, namespace=namespace))
  }
  dt_filepath_namespace <- rbindlist(lapply(package_units,parse_namespace))

  return(dt_filepath_namespace)
}
11 changes: 10 additions & 1 deletion _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ reference:
- parse_line_metrics
- parse_line_type_file
- parse_commit_message_id
- parse_gof_patterns
- title: __Filters__
desc: Filters for file extensions, and substrings, such as test files, and time intervals.
- contents:
Expand Down Expand Up @@ -65,6 +64,8 @@ reference:
- motif_factory
- motif_factory_triangle
- motif_factory_square
- motif_factory_anti_triangle
- motif_factory_anti_square
- title: __Identity__
desc: Provide the same id to authors who use variations of names or emails.
- contents:
Expand Down Expand Up @@ -202,3 +203,11 @@ reference:
- annotate_src_text
- query_src_text
- query_src_text_class_names
- query_src_text_namespace

- title: __Gang of Four Patterns__
desc: Functions related to Gang of Four Patterns
- contents:
- parse_gof_patterns
- write_gof_patterns
- subset_gof_class
19 changes: 5 additions & 14 deletions man/parse_gof_patterns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit ad6efa7

Please sign in to comment.