Skip to content

Commit

Permalink
Merge pull request #481 from OmarAI2003/check_SELECT_WHERE_labels_ORDER
Browse files Browse the repository at this point in the history
Check select where labels order
  • Loading branch information
andrewtavis authored Oct 26, 2024
2 parents 86676b0 + 743e81c commit a487a18
Show file tree
Hide file tree
Showing 13 changed files with 66 additions and 11 deletions.
61 changes: 61 additions & 0 deletions src/scribe_data/check/check_query_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,61 @@ def check_defined_return_forms(query_text: str) -> str:
return ""


# MARK: Forms Order


def check_forms_order(query_text: str) -> bool:
    """
    Checks that the order of variables in the SELECT statement (excluding lexeme and lexemeID)
    matches the order of the same variables in the WHERE clause of the given SPARQL query.

    Parameters
    ----------
        query_text : str
            The SPARQL query text as a string.

    Returns
    -------
        bool
            True if the order of the variables matches, False otherwise. False is also
            returned when no ``SELECT ... WHERE`` statement can be found in the query.
    """
    select_pattern = r"SELECT\s+(.*?)\s+WHERE"

    # Extracting the variables from the SELECT statement.
    select_match = re.search(select_pattern, query_text, flags=re.DOTALL)
    if select_match is None:
        return False  # invalid query format if no SELECT match

    # Exclude the first two variables: for the usual
    # `(REPLACE(STR(?lexeme), ...) AS ?lexemeID)` expression these are lexeme and lexemeID.
    select_vars = re.findall(r"\?(\w+)", select_match[1])[2:]

    # Regex patterns to capture the variables in the WHERE clause.
    dt_pattern = r"WHERE\s*\{[^}]*?wikibase:lemma\s*\?\s*(\w+)\s*[;.]\s*"
    # Variable names are captured with \w+ (as for the SELECT variables) so that trailing
    # punctuation or a line break directly after the name is never part of the match.
    forms_pattern = r"ontolex:representation\s+\?(\w+)"
    where_vars = []

    # Extracting variables from the WHERE clause, starting with the lemma variable.
    dt_match = re.findall(dt_pattern, query_text)
    if dt_match == ["lemma"]:
        # A lemma variable literally named `lemma` is compared as `preposition`.
        # NOTE(review): presumably for preposition queries that bind ?lemma - confirm.
        where_vars.append("preposition")

    elif dt_match:
        where_vars.append(dt_match[0])

    where_vars += re.findall(forms_pattern, query_text)

    # Labels provided by the labeling service like 'case' and 'gender' are not bound via
    # ontolex:representation, so they are placed at the position they have in select_vars.
    label_vars = [
        var for var in ("case", "gender", "auxiliaryVerb") if var in select_vars
    ]

    # Insert in ascending SELECT position: inserting a later label first would append it
    # past the end of where_vars and an earlier label would then shift it out of place.
    for var in sorted(label_vars, key=select_vars.index):
        where_vars.insert(select_vars.index(var), var)

    # Check if the order of variables matches.
    return select_vars == where_vars


# MARK: Main Query Forms Validation
def check_query_forms() -> None:
"""
Expand Down Expand Up @@ -471,6 +526,12 @@ def check_query_forms() -> None:
error_output += f"\n{index}. {query_file_str}: {defined_unreturned_forms}\n"
index += 1

# Check the order of variables in the WHERE and SELECT clauses.
select_where_labels_matching = check_forms_order(query_text)
if not select_where_labels_matching:
error_output += f"\n{index}. {query_file_str}:\n - The order of variables in the SELECT statement does not match their order in the query.\n"
index += 1

if extract_forms_from_sparql(query_file):
query_form_check_dict = {}
for form_text in extract_forms_from_sparql(query_file):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ WHERE {

?lexeme dct:language wd:Q9035 ;
wikibase:lexicalCategory wd:Q24905 ;
wikibase:lemma ?infinitive
wikibase:lemma ?infinitive .

# MARK: Infinitive Active

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?adjective
?adessiveSingular
?adessivePlural
?ablativeSingular
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?adjective
?essiveSingular
?essivePlural
?abessiveSingular
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?infinitive
?feminineImperativeSecondPersonSingular
?masculineImperativeSecondPersonSingular
?feminineImperativeSecondPersonPlural
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Note: We need to filter for "hi" to remove Urdu (ur) words.

SELECT
?lexeme
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?postposition

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Note: We need to filter for "hi" to remove Urdu (ur) words.

SELECT
?lexeme
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Note: We need to filter for "ur" to remove Hindi (hi) words.

SELECT
?lexeme
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?postposition

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Note: We need to filter for "ur" to remove Hindi (hi) words.

SELECT
?lexeme
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?singular
?plural
?gender

WHERE {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Note: We need to filter for "pnb" to select Shahmukhi words.

SELECT
?lexeme
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?singular
?plural
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# Note: We need to filter for "pnb" to select Shahmukhi words.

SELECT
?lexeme
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?singular
?plural
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?lemma
?adjective
?nominativeFeminineSingular
?nominativeMasculineSingular
?nominativeNeuterSingular
Expand All @@ -15,7 +15,7 @@ SELECT
WHERE {
?lexeme dct:language wd:Q8798 ;
wikibase:lexicalCategory wd:Q34698 ;
wikibase:lemma ?lemma .
wikibase:lemma ?adjective .

OPTIONAL {
?lexeme ontolex:lexicalForm ?nominativeFeminineSingularForm .
Expand Down

0 comments on commit a487a18

Please sign in to comment.