Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store subword indices for quality scores in Response #357

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/tests/common-impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,11 @@ void TestSuite<Service>::qualityEstimatorWords(Ptr<TranslationModel> model) {
std::string source = readFromStdin();
const Response response = bridge_.translate(service_, model, std::move(source), responseOptions);

for (const auto &sentenceQualityEstimate : response.qualityScores) {
for (size_t sentenceIdx = 0; sentenceIdx < response.qualityScores.size(); ++sentenceIdx) {
const auto &sentenceQualityEstimate = response.qualityScores[sentenceIdx];
std::cout << "[SentenceBegin]\n";

for (const auto &wordByteRange : sentenceQualityEstimate.wordByteRanges) {
for (const auto &wordByteRange : getWordByteRanges(response, sentenceIdx)) {
const string_view word(response.target.text.data() + wordByteRange.begin, wordByteRange.size());
std::cout << word << "\n";
}
Expand Down
10 changes: 10 additions & 0 deletions src/translator/definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ struct ByteRange {
bool operator==(ByteRange other) const { return begin == other.begin && end == other.end; }
};

/// A Subword range is mechanically the same as a `ByteRange`, but instead of
/// describing a span of bytes, it describes a span of Subword tokens. Using
/// `Annotation.word()` you can switch between the two.
struct SubwordRange {
  /// Index of the first subword token in the span.
  size_t begin;
  /// Exclusive end of the span; `size()` is computed as `end - begin`.
  size_t end;
  /// Number of subword tokens covered by the span.
  /// Returns a plain `size_t`: a top-level `const` on a by-value return type has no effect.
  size_t size() const { return end - begin; }
  /// Two ranges compare equal when both endpoints match.
  bool operator==(SubwordRange other) const { return begin == other.begin && end == other.end; }
};

class Response;
using CallbackType = std::function<void(Response &&)>;

Expand Down
22 changes: 2 additions & 20 deletions src/translator/quality_estimator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Response::SentenceQualityScore UnsupervisedQualityEstimator::computeSentenceScor
const float sentenceScore =
std::accumulate(std::begin(wordScores), std::end(wordScores), float(0.0)) / wordScores.size();

return {wordScores, subwordToWords(wordIndices, target, sentenceIdx), sentenceScore};
return {wordScores, wordIndices, sentenceScore};
}

LogisticRegressorQualityEstimator::Matrix::Matrix(const size_t rowsParam, const size_t colsParam)
Expand Down Expand Up @@ -160,7 +160,7 @@ Response::SentenceQualityScore LogisticRegressorQualityEstimator::computeSentenc
const float sentenceScore =
std::accumulate(std::begin(wordScores), std::end(wordScores), float(0.0)) / wordScores.size();

return {wordScores, subwordToWords(wordIndices, target, sentenceIdx), sentenceScore};
return {wordScores, wordIndices, sentenceScore};
}

std::vector<float> LogisticRegressorQualityEstimator::predict(const Matrix& features) const {
Expand Down Expand Up @@ -267,22 +267,4 @@ std::vector<SubwordRange> mapWords(const std::vector<float>& logProbs, const Ann
return wordIndices;
}

std::vector<ByteRange> subwordToWords(const std::vector<SubwordRange>& wordIndices, const AnnotatedText& target,
const size_t sentenceIdx) {
std::vector<ByteRange> words;

for (const SubwordRange& wordIndice : wordIndices) {
size_t wordBegin = target.wordAsByteRange(sentenceIdx, wordIndice.begin).begin;
size_t wordEnd = target.wordAsByteRange(sentenceIdx, wordIndice.end).begin;

if (isspace(target.text.at(wordBegin))) {
++wordBegin;
}

words.emplace_back(ByteRange{wordBegin, wordEnd});
}

return words;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's happening here? There was a "words as meaningful QE units" notion before; are we losing this with the HTML-integrated QE?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The QualityEstimator class still works under this assumption; subwords are grouped into words in mapWords:

/// Groups the subword tokens of one target sentence into words.
///
/// @param [in] logProbs: one log probability per subword token, including the trailing EOS token
/// @param [in] target: AnnotatedText holding the translated text and its subword annotation
/// @param [in] sentenceIdx: the id of the sentence to process
/// @return one SubwordRange per word, or an empty vector when the target is empty
std::vector<SubwordRange> mapWords(const std::vector<float>& logProbs, const AnnotatedText& target,
                                   const size_t sentenceIdx) {
  // Ignore empty target
  if ((logProbs.size() < 2) || (target.numWords(sentenceIdx) == 0)) {
    return {};
  }
  // A non-empty translation is expected to contain at least one word, so start with one open range
  std::vector<SubwordRange> wordIndices(/*numWords=*/1);
  /// The LogisticRegressorQualityEstimator model ignores the presence of the EOS token, and hence we only need to
  /// iterate n-1 positions.
  for (size_t subwordIdx = 0; subwordIdx < (logProbs.size() - 1); ++subwordIdx) {
    ByteRange subword = target.wordAsByteRange(sentenceIdx, subwordIdx);
    const char firstLetter = target.text.at(subword.begin);
    // If the first character is whitespace, it's the beginning of a new word.
    // The cast is required: passing a negative `char` (non-ASCII byte with a signed `char`)
    // to isspace is undefined behaviour.
    if (isspace(static_cast<unsigned char>(firstLetter))) {
      wordIndices.back().end = subwordIdx;
      wordIndices.emplace_back();
      wordIndices.back().begin = subwordIdx;
    }
  }
  // Close the last word just before the EOS token
  wordIndices.back().end = logProbs.size() - 1;
  return wordIndices;
}

The bit of code I removed here turned those groupings into a single ByteRange. That has been moved to the WASM bindings, since that's the last possible moment to do it.

I think most clients will find it useful to do this themselves by accessing the AnnotatedText's data. But since we don't expose those to JavaScript, and I didn't want to start a half-assed attempt at reworking the JavaScript API, I moved the functionality above to here:

std::vector<SentenceQualityScore> getQualityScores(const Response& response) {
std::vector<SentenceQualityScore> scores;
scores.reserve(response.qualityScores.size());
for (size_t sentenceIdx = 0; sentenceIdx < response.qualityScores.size(); ++sentenceIdx) {
std::vector<ByteRange> wordByteRanges;
wordByteRanges.reserve(response.qualityScores[sentenceIdx].wordRanges.size());
for (auto&& word : response.qualityScores[sentenceIdx].wordRanges) {
wordByteRanges.emplace_back();
wordByteRanges.back().begin = response.target.wordAsByteRange(sentenceIdx, word.begin).begin;
wordByteRanges.back().end = response.target.wordAsByteRange(sentenceIdx, word.end).begin;
}
scores.emplace_back(SentenceQualityScore{response.qualityScores[sentenceIdx].wordScores, std::move(wordByteRanges),
response.qualityScores[sentenceIdx].sentenceScore});
}
return scores;
}


} // namespace marian::bergamot
12 changes: 0 additions & 12 deletions src/translator/quality_estimator.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ class QualityEstimator {
virtual void computeQualityScores(const Histories &histories, Response &response) const = 0;
};

using SubwordRange = ByteRange;

/// Unsupervised Quality Estimator model. It uses the translator model's log probabilities (log probs) as a proxy for
/// quality scores. Then, for a given word, its quality score is computed by taking the mean of the log probs of the
/// tokens that make it up. The sentence score is the mean of all word's log probs.
Expand Down Expand Up @@ -209,14 +207,4 @@ inline std::shared_ptr<QualityEstimator> createQualityEstimator(const AlignedMem
std::vector<SubwordRange> mapWords(const std::vector<float> &logProbs, const AnnotatedText &target,
const size_t sentenceIdx);

/// Given a vector of subwordRanges, it maps the elements to be real words rather than sublevel tokens. The words are
/// represented through ByteRanges.

/// @param [in] wordIndices: A vector where each element corresponds to a real word and holds the
/// SubwordRange (an alias of ByteRange) that represents the word's sublevel token positions
/// @param [in] target: AnnotatedText target value
/// @param [in] sentenceIdx: the id of a candidate sentence
std::vector<ByteRange> subwordToWords(const std::vector<SubwordRange> &wordIndices, const AnnotatedText &target,
const size_t sentenceIdx);

} // namespace marian::bergamot
18 changes: 18 additions & 0 deletions src/translator/response.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,22 @@ std::vector<Alignment> remapAlignments(const Response &first, const Response &se
return alignments;
}

/// Converts the subword-level word ranges stored in `response.qualityScores[sentenceIdx]`
/// into byte ranges over `response.target.text`.
///
/// @param [in] response: Response whose quality scores and target annotation are read
/// @param [in] sentenceIdx: index into `response.qualityScores`; it is not bounds-checked here
/// @return one ByteRange per word of the sentence, in the order of `wordRanges`
std::vector<ByteRange> getWordByteRanges(const Response &response, size_t sentenceIdx) {
  std::vector<ByteRange> wordByteRanges;
  wordByteRanges.reserve(response.qualityScores[sentenceIdx].wordRanges.size());

  for (const auto &word : response.qualityScores[sentenceIdx].wordRanges) {
    size_t wordBegin = response.target.wordAsByteRange(sentenceIdx, word.begin).begin;
    // The word stops where the subword at index `end` starts.
    const size_t wordEnd = response.target.wordAsByteRange(sentenceIdx, word.end).begin;

    // Drop a single leading whitespace byte. The cast is required: passing a negative `char`
    // (non-ASCII byte with a signed `char`) to std::isspace is undefined behaviour.
    if (std::isspace(static_cast<unsigned char>(response.target.text.at(wordBegin)))) {
      ++wordBegin;
    }

    wordByteRanges.emplace_back(ByteRange{wordBegin, wordEnd});
  }

  return wordByteRanges;
}

} // namespace marian::bergamot
6 changes: 4 additions & 2 deletions src/translator/response.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ struct Response {
/// Quality estimate for a single translated sentence: per-word scores, the
/// position of each word, and an overall sentence score.
struct SentenceQualityScore {
/// Quality score of each translated word
std::vector<float> wordScores;
/// Position of each word in the translated text, expressed as a range of bytes
std::vector<ByteRange> wordByteRanges;
/// Position of the start and end token of each word in the translated text,
/// expressed as a range of subword tokens
std::vector<SubwordRange> wordRanges;
/// Whole sentence quality score (it is composed of the mean of its words)
float sentenceScore = 0.0;
};
Expand Down Expand Up @@ -77,6 +77,8 @@ struct Response {

std::vector<Alignment> remapAlignments(const Response &first, const Response &second);

std::vector<ByteRange> getWordByteRanges(Response const &response, size_t sentenceIdx);

} // namespace bergamot
} // namespace marian

Expand Down
25 changes: 23 additions & 2 deletions wasm/bindings/response_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,38 @@
#include "response.h"

using Response = marian::bergamot::Response;
using SentenceQualityScore = marian::bergamot::Response::SentenceQualityScore;
using ByteRange = marian::bergamot::ByteRange;

/// Binding-side mirror of Response::SentenceQualityScore that carries
/// wordByteRanges in place of wordRanges.
struct SentenceQualityScore {
  /// Per-word quality scores of the translated sentence
  std::vector<float> wordScores;
  /// Byte span of every word within the translated text
  std::vector<ByteRange> wordByteRanges;
  /// Sentence-level quality score (the mean of its word scores)
  float sentenceScore = 0.0f;
};

using namespace emscripten;

// Binding code
// Registers ByteRange with embind as a value object named "ByteRange"
// with the fields `begin` and `end`.
EMSCRIPTEN_BINDINGS(byte_range) {
  value_object<ByteRange> byteRangeBinding("ByteRange");
  byteRangeBinding.field("begin", &ByteRange::begin);
  byteRangeBinding.field("end", &ByteRange::end);
}

/// Returns a copy of the per-sentence quality scores held by `response`.
std::vector<SentenceQualityScore> getQualityScores(const Response& response) { return response.qualityScores; }
std::vector<SentenceQualityScore> getQualityScores(const Response& response) {
std::vector<SentenceQualityScore> scores;
scores.reserve(response.qualityScores.size());

for (size_t sentenceIdx = 0; sentenceIdx < response.qualityScores.size(); ++sentenceIdx) {
scores.emplace_back(SentenceQualityScore{response.qualityScores[sentenceIdx].wordScores,
marian::bergamot::getWordByteRanges(response, sentenceIdx),
response.qualityScores[sentenceIdx].sentenceScore});
}

return scores;
}

EMSCRIPTEN_BINDINGS(response) {
class_<Response>("Response")
Expand Down
1 change: 1 addition & 0 deletions wasm/test_page/js/worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ const _parseTranslatedTextSentenceQualityScores = (vectorResponse) => {
sentenceQualityScores.push(sentenceQualityScore);
}
result.push(sentenceQualityScores);
vectorSentenceQualityScore.delete();
}
return result;
}
Expand Down