# MARK: Forms Order


def check_forms_order(query_text: str) -> bool:
    """
    Checks that the order of variables in the SELECT statement (excluding lexeme and lexemeID)
    matches the order of the same variables in the WHERE clause of the given SPARQL query.

    Parameters
    ----------
    query_text : str
        The SPARQL query text as a string.

    Returns
    -------
    bool
        True if the order of the variables matches, False otherwise. False is also
        returned when no "SELECT ... WHERE" statement can be found in the query.
    """
    select_pattern = r"SELECT\s+(.*?)\s+WHERE"

    # Extracting the variables from the SELECT statement.
    select_match = re.search(select_pattern, query_text, flags=re.DOTALL)
    if select_match is None:
        return False  # invalid query format if no SELECT match

    # The first two hits are ?lexeme and ?lexemeID from the
    # "(REPLACE(STR(?lexeme), ...) AS ?lexemeID)" expression, so they are dropped.
    select_vars = re.findall(r"\?(\w+)", select_match[1])[2:]

    # Regex patterns to capture the variables in the WHERE clause.
    dt_pattern = r"WHERE\s*\{[^}]*?wikibase:lemma\s*\?\s*(\w+)\s*[;.]\s*"
    # Variable names are matched with \w+ (as in the SELECT extraction) so trailing
    # punctuation or a line break directly after the variable is not captured.
    forms_pattern = r"ontolex:representation\s+\?(\w+)"
    where_vars = []

    # Extracting variables from the WHERE clause.
    dt_match = re.findall(dt_pattern, query_text)
    if dt_match == ["lemma"]:
        # A lemma bound to the generic ?lemma variable is reported as "preposition".
        where_vars.append("preposition")

    elif dt_match:
        where_vars.append(dt_match[0])

    where_vars += re.findall(forms_pattern, query_text)

    # Labels provided by the labeling service ('case', 'gender', 'auxiliaryVerb') have no
    # ontolex:representation triple, so they are placed at their SELECT position.
    # Insertions are done in ascending index order: list.insert appends when the index is
    # beyond the end of the list, so inserting a later label first would misplace it.
    label_positions = sorted(
        (select_vars.index(var), var)
        for var in ("case", "gender", "auxiliaryVerb")
        if var in select_vars
    )
    for index, var in label_positions:
        where_vars.insert(index, var)

    # Check if the order of variables matches.
    return select_vars == where_vars
# MARK: Infinitive Active diff --git a/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_3.sparql b/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_3.sparql index 7d2864d76..1e1dc9237 100644 --- a/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_3.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_3.sparql @@ -4,6 +4,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective ?adessiveSingular ?adessivePlural ?ablativeSingular diff --git a/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_4.sparql b/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_4.sparql index 66f545532..b4f9f002d 100644 --- a/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_4.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/estonian/adjectives/query_adjectives_4.sparql @@ -4,6 +4,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective ?essiveSingular ?essivePlural ?abessiveSingular diff --git a/src/scribe_data/wikidata/language_data_extraction/hebrew/verbs/query_verbs_2.sparql b/src/scribe_data/wikidata/language_data_extraction/hebrew/verbs/query_verbs_2.sparql index 6a30175f1..d12bc9f38 100644 --- a/src/scribe_data/wikidata/language_data_extraction/hebrew/verbs/query_verbs_2.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/hebrew/verbs/query_verbs_2.sparql @@ -4,7 +4,6 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) - ?infinitive ?feminineImperativeSecondPersonSingular ?masculineImperativeSecondPersonSingular ?feminineImperativeSecondPersonPlural diff --git a/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/postpositions/query_postpositions.sparql 
b/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/postpositions/query_postpositions.sparql index 9416e0e9c..b53284b41 100644 --- a/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/postpositions/query_postpositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/postpositions/query_postpositions.sparql @@ -5,7 +5,6 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. SELECT - ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?postposition diff --git a/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/prepositions/query_prepositions.sparql index 5df65a582..df271bd3e 100644 --- a/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/hindustani/hindi/prepositions/query_prepositions.sparql @@ -5,7 +5,6 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. SELECT - ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition diff --git a/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/postpositions/query_postpositions.sparql b/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/postpositions/query_postpositions.sparql index f55f172af..66abfa087 100644 --- a/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/postpositions/query_postpositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/postpositions/query_postpositions.sparql @@ -5,7 +5,6 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. 
SELECT - ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?postposition diff --git a/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/prepositions/query_prepositions.sparql b/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/prepositions/query_prepositions.sparql index 9cb4d03f2..2ff0ef368 100644 --- a/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/prepositions/query_prepositions.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/hindustani/urdu/prepositions/query_prepositions.sparql @@ -5,7 +5,6 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. SELECT - ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition diff --git a/src/scribe_data/wikidata/language_data_extraction/italian/proper_nouns/query_proper_nouns.sparql b/src/scribe_data/wikidata/language_data_extraction/italian/proper_nouns/query_proper_nouns.sparql index e24635171..89626346f 100644 --- a/src/scribe_data/wikidata/language_data_extraction/italian/proper_nouns/query_proper_nouns.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/italian/proper_nouns/query_proper_nouns.sparql @@ -5,7 +5,6 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular - ?plural ?gender WHERE { diff --git a/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/nouns/query_nouns.sparql b/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/nouns/query_nouns.sparql index 107d7e513..10b67e3c8 100644 --- a/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/nouns/query_nouns.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/nouns/query_nouns.sparql @@ -5,7 +5,6 @@ # Note: We need to filter for "pnb" to select Shahmukhi words. 
SELECT - ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/proper_nouns/query_proper_nouns.sparql b/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/proper_nouns/query_proper_nouns.sparql index 40c90d7c3..9ea37f6c5 100644 --- a/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/proper_nouns/query_proper_nouns.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/punjabi/shahmukhi/proper_nouns/query_proper_nouns.sparql @@ -5,7 +5,6 @@ # Note: We need to filter for "pnb" to select Shahmukhi words. SELECT - ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/wikidata/language_data_extraction/ukrainian/adjectives/query_adjectives.sparql b/src/scribe_data/wikidata/language_data_extraction/ukrainian/adjectives/query_adjectives.sparql index 79797ab64..1251289e7 100644 --- a/src/scribe_data/wikidata/language_data_extraction/ukrainian/adjectives/query_adjectives.sparql +++ b/src/scribe_data/wikidata/language_data_extraction/ukrainian/adjectives/query_adjectives.sparql @@ -4,7 +4,7 @@ SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) - ?lemma + ?adjective ?nominativeFeminineSingular ?nominativeMasculineSingular ?nominativeNeuterSingular @@ -15,7 +15,7 @@ SELECT WHERE { ?lexeme dct:language wd:Q8798 ; wikibase:lexicalCategory wd:Q34698 ; - wikibase:lemma ?lemma . + wikibase:lemma ?adjective . OPTIONAL { ?lexeme ontolex:lexicalForm ?nominativeFeminineSingularForm .