diff --git a/ovos_workshop/skills/common_query_skill.py b/ovos_workshop/skills/common_query_skill.py
index fb77fc50..80f2e3ed 100644
--- a/ovos_workshop/skills/common_query_skill.py
+++ b/ovos_workshop/skills/common_query_skill.py
@@ -13,7 +13,7 @@
 from abc import abstractmethod
 from enum import IntEnum
 from os.path import dirname
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 from ovos_bus_client import Message
 from ovos_utils.file_utils import resolve_resource_file
@@ -26,7 +26,7 @@
 class CQSMatchLevel(IntEnum):
     EXACT = 1  # Skill could find a specific answer for the question
     CATEGORY = 2  # Skill could find an answer from a category in the query
-    GENERAL = 3  # The query could be processed as a general quer
+    GENERAL = 3  # The query could be processed as a general query
 
 
 # Copy of CQSMatchLevel to use if the skill returns visual media
@@ -34,6 +34,8 @@
                               [e.name for e in CQSMatchLevel])
 
 """these are for the confidence calculation"""
+# TODO: TOPIC_MATCH_RELEVANCE and RELEVANCE_MULTIPLIER stack on the same count of
+# "relevant" words. This adds too much artificial confidence (>100%)
 # how much each topic word is worth
 # when found in the answer
 TOPIC_MATCH_RELEVANCE = 5
@@ -60,12 +62,18 @@ class CommonQuerySkill(OVOSSkill):
     """
 
     def __init__(self, *args, **kwargs):
-        # these should probably be configurable
+        # Confidence calculation numbers may be configured per-skill
         self.level_confidence = {
             CQSMatchLevel.EXACT: 0.9,
             CQSMatchLevel.CATEGORY: 0.6,
             CQSMatchLevel.GENERAL: 0.5
         }
+        self.relevance_multiplier = TOPIC_MATCH_RELEVANCE * RELEVANCE_MULTIPLIER
+        self.input_consumed_multiplier = 0.1
+        # TODO: The below defaults of 0.1 add ~25% for a 2-sentence response which is too much
+        self.response_sentences_multiplier = 0.1
+        self.response_words_multiplier = 1 / WORD_COUNT_DIVISOR
+
         super().__init__(*args, **kwargs)
 
         noise_words_filepath = f"text/{self.lang}/noise_words.list"
@@ -142,7 +150,16 @@ def __handle_question_query(self, message: Message):
             level = result[1]
             answer = result[2]
             callback = result[3] if len(result) > 3 else {}
-            confidence = self.__calc_confidence(match, search_phrase, level, answer)
+            if isinstance(level, float):
+                LOG.debug("Confidence directly reported by skill")
+                confidence = level
+            else:
+                LOG.info(f"Calculating confidence for level {level}")
+                confidence = self.__calc_confidence(match, search_phrase, level,
+                                                    answer)
+            if confidence > 1.0:
+                LOG.warning(f"Calculated confidence {confidence} > 1.0")
+                confidence = 1.0
             callback["answer"] = answer  # ensure we get it back in CQS_action
             self.bus.emit(message.response({"phrase": search_phrase,
                                             "skill_id": self.skill_id,
@@ -156,8 +173,8 @@
                                             "skill_id": self.skill_id,
                                             "searching": False}))
 
-    def __get_cq(self, search_phrase: str) -> (str, CQSMatchLevel, str,
-                                               Optional[dict]):
+    def __get_cq(self, search_phrase: str) -> (str, Union[CQSMatchLevel, float],
+                                               str, Optional[dict]):
         """
         Invoke the CQS handler to let the skill perform its search
         @param search_phrase: parsed question to get an answer for
@@ -201,36 +218,52 @@ def __calc_confidence(self, match: str, phrase: str, level: CQSMatchLevel,
         consumed_pct = len(match.split()) / len(phrase.split())
         if consumed_pct > 1.0:
             consumed_pct = 1.0
-        consumed_pct /= 10
 
-        # bonus for more sentences
-        num_sentences = float(float(len(answer.split("."))) / float(10))
+        # Approximate the number of sentences in the answer. A trailing `.` will
+        # split, so reduce length by 1. If no `.` is present, ensure we count
+        # any response as at least 1 sentence
+        num_sentences = max(len(answer.split(".")) - 1, 1)
 
-        # extract topic
+        # Remove articles and question words to approximate the meaningful part
+        # of what the skill extracted from the user input
         topic = self.remove_noise(match)
 
-        # calculate relevance
+        # Determine how many relevant words from the input are present in the
+        # answer
+        # TODO: Strip SSML from the answer here
         answer = answer.lower()
         matches = 0
         for word in topic.split(' '):
             if answer.find(word) > -1:
-                matches += TOPIC_MATCH_RELEVANCE
-
+                matches += 1
+        LOG.debug(f"Answer matched {matches} words")
         answer_size = len(answer.split(" "))
-        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)
 
+        # Calculate relevance as the percentage of relevant input words divided
+        # by the length of the response. This means that an answer that only
+        # contains the input will have a relevance value of 1
         relevance = 0.0
         if answer_size > 0:
             relevance = float(float(matches) / float(answer_size))
 
-        relevance = relevance * RELEVANCE_MULTIPLIER
+        # extra credit for more words up to a point. By default, 50 words is
+        # considered optimal
+        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)
 
-        # extra credit for more words up to a point
-        wc_mod = float(float(answer_size) / float(WORD_COUNT_DIVISOR)) * 2
-
+        # Calculate bonuses based on calculated values and configured weights
+        consumed_pct_bonus = consumed_pct * self.input_consumed_multiplier
+        num_sentences_bonus = num_sentences * self.response_sentences_multiplier
+        relevance_bonus = relevance * self.relevance_multiplier
+        word_count_bonus = answer_size * self.response_words_multiplier
+        LOG.debug(f"consumed_pct_bonus={consumed_pct_bonus}|num_sentence_bonus="
+                  f"{num_sentences_bonus}|relevance_bonus={relevance_bonus}|"
+                  f"word_count_bonus={word_count_bonus}")
         confidence = self.level_confidence[level] + \
-            consumed_pct + num_sentences + relevance + wc_mod
-
+            consumed_pct_bonus + num_sentences_bonus + relevance_bonus + \
+            word_count_bonus
+        if confidence > 1:
+            LOG.warning(f"Calculated confidence > 1.0: {confidence}")
+            return 1.0
         return confidence
 
     def __handle_query_classic(self, message):
@@ -270,7 +303,7 @@ def __handle_query_action(self, message: Message):
 
     @abstractmethod
     def CQS_match_query_phrase(self, phrase: str) -> \
-            Optional[Tuple[str, CQSMatchLevel, Optional[dict]]]:
+            Optional[Tuple[str, Union[CQSMatchLevel, float], Optional[dict]]]:
         """
         Determine an answer to the input phrase and return match information,
         or `None` if no answer can be determined.
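
Reviewer note: a rough standalone sketch of how the reworked bonuses combine under the new default weights. This is an approximation, not the actual method: remove_noise() is skipped, and RELEVANCE_MULTIPLIER = 2 and WORD_COUNT_DIVISOR = 100 are assumed to keep their existing values, which this diff does not touch.

    # Sketch only: mirrors the reworked __calc_confidence from this diff,
    # with remove_noise() omitted for brevity
    TOPIC_MATCH_RELEVANCE = 5
    RELEVANCE_MULTIPLIER = 2           # assumed existing value
    MAX_ANSWER_LEN_FOR_CONFIDENCE = 50
    WORD_COUNT_DIVISOR = 100           # assumed existing value


    def sketch_confidence(match: str, phrase: str, level_conf: float,
                          answer: str) -> float:
        # Fraction of the user's input consumed by the match, capped at 1.0
        consumed_pct = min(len(match.split()) / len(phrase.split()), 1.0)
        # A trailing "." yields an empty split element; count at least 1 sentence
        num_sentences = max(len(answer.split(".")) - 1, 1)
        answer = answer.lower()
        # Share of match words that appear in the answer (noise words kept)
        matched = sum(1 for word in match.lower().split() if word in answer)
        answer_size = len(answer.split(" "))
        relevance = matched / answer_size if answer_size else 0.0
        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE, answer_size)
        # Combine bonuses using the new default weights from __init__
        confidence = (level_conf
                      + consumed_pct * 0.1
                      + num_sentences * 0.1
                      + relevance * TOPIC_MATCH_RELEVANCE * RELEVANCE_MULTIPLIER
                      + answer_size / WORD_COUNT_DIVISOR)
        return min(confidence, 1.0)


    # A GENERAL-level (0.5) match where the answer repeats the topic words:
    # the relevance term alone exceeds 1.0 here
    print(sketch_confidence("coca cola", "what is coca cola", 0.5,
                            "Coca Cola is a carbonated soft drink."))  # -> 1.0

Running the sketch shows the relevance term pushing the total past the clamp on an ordinary answer, which is exactly the behavior the stacked-multiplier TODO at the top of the diff calls out.
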
diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py
index ef182e86..a9eff418 100644
--- a/test/unittests/skills/test_common_query_skill.py
+++ b/test/unittests/skills/test_common_query_skill.py
@@ -1,4 +1,5 @@
 from unittest import TestCase
+from unittest.mock import Mock
 
 from ovos_utils.messagebus import FakeBus
 from ovos_workshop.skills.base import BaseSkill
@@ -36,16 +37,39 @@ def test_handle_question_query(self):
         pass
 
     def test_get_cq(self):
-        # TODO
-        pass
+        test_phrase = "test"
+        mock_return = Mock()
+        self.skill.CQS_match_query_phrase = Mock(return_value=mock_return)
+        result = self.skill._CommonQuerySkill__get_cq(test_phrase)
+        self.skill.CQS_match_query_phrase.assert_called_once_with(test_phrase)
+        self.assertEqual(result, mock_return)
+
+        self.skill.CQS_match_query_phrase.side_effect = Exception()
+        result = self.skill._CommonQuerySkill__get_cq(test_phrase)
+        self.assertIsNone(result)
 
     def test_remove_noise(self):
-        # TODO
-        pass
+        noisy_match = "what is a computer"
+        normalized = "computer"
+        self.assertEqual(self.skill.remove_noise(noisy_match), normalized)
 
     def test_calc_confidence(self):
-        # TODO
-        pass
+        generic_q = "what is coca cola"
+        specific_q = "how much caffeine is in coca cola"
+        specific_q_2 = "what is the stock price for coca cola"
+        cw_answer = ("The drink diet coke has 32 milligrams of caffeine in "
+                     "250 milliliters. Provided by CaffeineWiz.")
+
+        generic_conf = self.skill._CommonQuerySkill__calc_confidence(
+            "coca cola", generic_q, CQSMatchLevel.GENERAL, cw_answer)
+        exact_conf = self.skill._CommonQuerySkill__calc_confidence(
+            "coca cola", specific_q, CQSMatchLevel.EXACT, cw_answer)
+        low_conf = self.skill._CommonQuerySkill__calc_confidence(
+            "coca cola", specific_q_2, CQSMatchLevel.GENERAL, cw_answer)
+
+        self.assertEqual(exact_conf, 1.0)
+        self.assertLess(generic_conf, exact_conf)
+        self.assertLess(low_conf, generic_conf)
 
     def test_handle_query_action(self):
         # TODO
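
For skill authors, the practical effect of the Union[CQSMatchLevel, float] change is that CQS_match_query_phrase may now report a pre-computed confidence instead of a match level, in which case the handler skips __calc_confidence entirely. A minimal hypothetical sketch follows; the skill class, answer text, and callback data are illustrative only, and the handler unpacks the return as (match, level-or-confidence, answer, optional callback data):

    # Hypothetical skill demonstrating the new float-confidence return path
    from typing import Optional, Tuple, Union

    from ovos_workshop.skills.common_query_skill import (CommonQuerySkill,
                                                         CQSMatchLevel)


    class CaffeineSkill(CommonQuerySkill):
        def CQS_match_query_phrase(self, phrase: str) -> \
                Optional[Tuple[str, Union[CQSMatchLevel, float], str,
                               Optional[dict]]]:
            if "caffeine" not in phrase.lower():
                return None  # no match; let other skills answer
            answer = ("Diet Coke has 32 milligrams of caffeine in "
                      "250 milliliters.")
            # Returning a float here bypasses __calc_confidence; returning
            # CQSMatchLevel.EXACT instead would use the calculation
            return phrase, 0.85, answer, {"source": "CaffeineWiz"}

Since the handler clamps anything above 1.0 and logs a warning, skills reporting their own confidence should keep the value within [0, 1].
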