Skip to content

Commit

Permalink
i #313 Updates to Syntax Extractor
Browse files Browse the repository at this point in the history
- Added parameter for excluding licenses in class and file-level comment extraction
- Implemented function extraction for function names with optional parameters
- Implemented variable extraction with optional types
- Added examples for removing empty comments and/or comment delimiters

Signed-off-by: Dao McGill <[email protected]>
  • Loading branch information
daomcgill committed Oct 24, 2024
1 parent 3b84b8e commit 3a4779f
Show file tree
Hide file tree
Showing 7 changed files with 367 additions and 27 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ export(query_src_text)
export(query_src_text_class_docs)
export(query_src_text_class_names)
export(query_src_text_file_docs)
export(query_src_text_functions)
export(query_src_text_namespace)
export(query_src_text_variables)
export(read_temporary_file)
export(recolor_network_by_community)
export(refresh_jira_issues)
Expand Down
182 changes: 173 additions & 9 deletions R/src.R
Original file line number Diff line number Diff line change
Expand Up @@ -444,16 +444,17 @@ query_src_text_namespace <- function(srcml_path,srcml_filepath){
#'
#' @param srcml_path The path to the srcML binary.
#' @param srcml_filepath The path to the srcML file to be queried.
#' @param exclude_license If TRUE, excludes license-related comments. Defaults to TRUE.
#'
#' @return A data.table containing the file path and the file-level documentation comment.
#' @export
query_src_text_file_docs <- function(srcml_path, srcml_filepath) {
query_src_text_file_docs <- function(srcml_path, srcml_filepath, exclude_license = TRUE) {
# Expand paths
srcml_path <- path.expand(srcml_path)
srcml_filepath <- path.expand(srcml_filepath)

# This XPath query is designed to extract meaningful file-level documentation comments
# while excluding the common license blocks found at the top of many source files.
# XPath query with and without license comments, depending on exclude_license parameter
# This query is designed to extract meaningful file-level documentation comments.
# It works as follows:
#
# 1. "//src:unit/src:comment": Selects all <comment> elements within the <unit> (file) node,
Expand All @@ -462,8 +463,11 @@ query_src_text_file_docs <- function(srcml_path, srcml_filepath) {
# 2. "[not(contains(., 'Licensed')) and not(contains(., 'license'))]": Filters out
# comments that contain the text 'Licensed' or 'license', which are typically used
# for licensing information like the Apache License.

xpath_query <- "//src:unit/src:comment[not(contains(., 'Licensed')) and not(contains(., 'license'))]"
if (exclude_license) {
xpath_query <- "//src:unit/src:comment[not(contains(., 'Licensed')) and not(contains(., 'license'))]"
} else {
xpath_query <- "//src:unit/src:comment"
}

# Execute query using the srcML binary
srcml_output <- query_src_text(srcml_path, xpath_query, srcml_filepath)
Expand Down Expand Up @@ -502,15 +506,17 @@ query_src_text_file_docs <- function(srcml_path, srcml_filepath) {
#'
#' @param srcml_path The path to the srcML binary.
#' @param srcml_filepath The path to the srcML file to be queried.
#' @param exclude_license If TRUE, excludes license-related comments. Defaults to TRUE.
#'
#' @return A data.table containing the file path and the class-level documentation comment.
#' @export
query_src_text_class_docs <- function(srcml_path, srcml_filepath) {
query_src_text_class_docs <- function(srcml_path, srcml_filepath, exclude_license = TRUE) {
# Expand paths
srcml_path <- path.expand(srcml_path)
srcml_filepath <- path.expand(srcml_filepath)

# This XPath query is designed to extract meaningful class-level Javadoc comments
# XPath query with and without license comments, depending on exclude_license parameter
# This query is designed to extract meaningful class-level Javadoc comments
# while avoiding license blocks. It works as follows:
#
# 1. "//src:class": Selects all <class> elements representing class definitions
Expand All @@ -524,8 +530,11 @@ query_src_text_class_docs <- function(srcml_path, srcml_filepath) {
# 3. "/preceding-sibling::src:comment[@format='javadoc'][1]": Once the class is found,
# this selects the first Javadoc comment that directly precedes the class definition
# and is closest to it.

xpath_query <- "//src:class[preceding-sibling::src:comment[@format='javadoc' and not(contains(., 'Licensed'))]]/preceding-sibling::src:comment[@format='javadoc'][1]"
if (exclude_license) {
xpath_query <- "//src:class[preceding-sibling::src:comment[@format='javadoc' and not(contains(., 'Licensed'))]]/preceding-sibling::src:comment[@format='javadoc'][1]"
} else {
xpath_query <- "//src:class[preceding-sibling::src:comment[@format='javadoc']]/preceding-sibling::src:comment[@format='javadoc'][1]"
}

# Execute query
srcml_output <- query_src_text(srcml_path, xpath_query, srcml_filepath)
Expand All @@ -551,6 +560,161 @@ query_src_text_class_docs <- function(srcml_path, srcml_filepath) {
}


#' Query srcML Variable Declarations
#'
#' This function extracts variable names and their types from the srcML XML file.
#'
#' @param srcml_path The path to the srcML binary.
#' @param srcml_filepath The path to the srcML file to be queried (see \code{\link{annotate_src_text}}).
#' @param var_type If TRUE, includes variable types. Defaults to TRUE.
#'
#' @return A data.table containing filepath, variable_name, and optionally variable_type.
#' @export
query_src_text_variables <- function(srcml_path, srcml_filepath, var_type = TRUE) {
# Expand paths
srcml_path <- path.expand(srcml_path)
srcml_filepath <- path.expand(srcml_filepath)

# Define the XPath query to select variable declarations
xpath_query <- "//src:decl_stmt/src:decl"

# Run the XPath query using the query_src_text function
srcml_output <- query_src_text(srcml_path, xpath_query, srcml_filepath)

# Parse the XML output
srcml_output <- XML::xmlTreeParse(paste(srcml_output, collapse = "\n"), useInternalNodes = TRUE)
srcml_root <- XML::xmlRoot(srcml_output)

# Get the list of unit nodes (each unit node corresponds to a variable declaration)
srcml_variable_nodes <- XML::xmlChildren(srcml_root)

# Function to parse each unit node and extract variable information
parse_variable_info <- function(unit) {
# Get the filepath from the 'filename' attribute
filepath <- XML::xmlGetAttr(unit, "filename")

# Extract the variable name
# We use XPath to select the <name> element that is not within a <type> element
var_name_node <- XML::getNodeSet(unit, ".//src:name[not(ancestor::src:type)]", namespaces = c(src = "http://www.srcML.org/srcML/src"))[[1]]
variable_name <- XML::xmlValue(var_name_node)

if (var_type) {
# Extract the variable type
var_type_nodes <- XML::getNodeSet(unit, ".//src:type//src:name", namespaces = c(src = "http://www.srcML.org/srcML/src"))
# Concatenate type names if there are multiple parts (e.g., generics)
if (length(var_type_nodes) > 0) {
variable_type <- paste(sapply(var_type_nodes, XML::xmlValue), collapse = "")
} else {
variable_type <- NA
}
# Return a data.table with filepath, variable_name, and variable_type
return(data.table(filepath = filepath, variable_name = variable_name, variable_type = variable_type))
} else {
# Return a data.table with filepath and variable_name
return(data.table(filepath = filepath, variable_name = variable_name))
}
}

# Apply the parsing function to each unit node
dt_variables <- rbindlist(lapply(srcml_variable_nodes, parse_variable_info))

return(dt_variables)
}


#' Query srcML Function Declarations
#'
#' This function extracts function names and optionally their parameters from the srcML XML file.
#'
#' @param srcml_path The path to the srcML binary.
#' @param srcml_filepath The path to the srcML file to be queried (see \code{\link{annotate_src_text}}).
#' @param include_parameters If TRUE, includes function parameters. Defaults to TRUE.
#'
#' @return A data.table containing filepath, function_name, and optionally parameters.
#' @export
query_src_text_functions <- function(srcml_path, srcml_filepath, include_parameters = TRUE) {
# Expand paths
srcml_filepath <- path.expand(srcml_filepath)

# Define the namespace mapping
ns <- c(src = "http://www.srcML.org/srcML/src")

# Parse the XML file
srcml_doc <- XML::xmlParse(srcml_filepath)
srcml_root <- XML::xmlRoot(srcml_doc)

# Get the list of function_decl nodes
function_nodes <- XML::getNodeSet(srcml_root, "//src:function_decl", namespaces = ns)

# Function to parse each function_decl node and extract information
parse_function_info <- function(unit) {
# Get the filepath from the nearest ancestor unit node
unit_ancestor <- XML::xpathApply(unit, "ancestor::src:unit[1]", namespaces = ns)[[1]]
filepath <- XML::xmlGetAttr(unit_ancestor, "filename")

# Extract the function name
function_name_node_list <- XML::getNodeSet(unit, "./src:name", namespaces = ns)
if (length(function_name_node_list) > 0) {
function_name <- XML::xmlValue(function_name_node_list[[1]])
} else {
function_name <- NA
}

if (include_parameters) {
# Extract parameters
param_nodes <- XML::getNodeSet(unit, "./src:parameter_list/src:parameter/src:decl", namespaces = ns)
if (length(param_nodes) > 0) {
# For each parameter, extract type and name
parameters <- sapply(param_nodes, function(param_node) {
# Get parameter type
type_nodes <- XML::getNodeSet(param_node, ".//src:type", namespaces = ns)
if (length(type_nodes) > 0) {
# Extract all name elements within type, including complex types
type_name_nodes <- XML::getNodeSet(type_nodes[[1]], ".//src:name", namespaces = ns)
param_type <- paste(sapply(type_name_nodes, XML::xmlValue), collapse = "")
} else {
param_type <- ""
}

# Get parameter name
name_node <- XML::getNodeSet(param_node, "./src:name", namespaces = ns)
param_name <- if (length(name_node) > 0) {
XML::xmlValue(name_node[[1]])
} else {
""
}

# Combine type and name
paste(param_type, param_name)
})
# Combine all parameters into a single string
parameters <- paste(parameters, collapse = ", ")
} else {
parameters <- ""
}

# Return a data.table with parameters
return(data.table(
filepath = filepath,
function_name = function_name,
parameters = parameters
))
} else {
# Return a data.table without parameters
return(data.table(
filepath = filepath,
function_name = function_name
))
}
}

# Apply the parsing function to each function_decl node
dt_functions <- rbindlist(lapply(function_nodes, parse_function_info))

return(dt_functions)
}



############## GoF Detection ##############

Expand Down
4 changes: 3 additions & 1 deletion man/query_src_text_class_docs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/query_src_text_file_docs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/query_src_text_functions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/query_src_text_variables.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3a4779f

Please sign in to comment.