Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22940 MF2 ICU4C: Update for bidi support #3236

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 78 additions & 35 deletions icu4c/source/i18n/messageformat2_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,13 @@ static bool isContentChar(UChar32 c) {
|| inRange(c, 0xE000, 0x10FFFF);
}

// See `s` in the MessageFormat 2 grammar
// See `bidi` in the MF2 grammar
static bool isBidi(UChar32 c) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe isBidiControl might be better?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 780a947

return (c == 0x061C || c == 0x200E || c == 0x200F ||
inRange(c, 0x2066, 0x2069));
}

// See `ws` in the MessageFormat 2 grammar
inline bool isWhitespace(UChar32 c) {
switch (c) {
case SPACE:
Expand Down Expand Up @@ -153,8 +159,8 @@ static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }

static bool isNameStart(UChar32 c) {
return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x061B) ||
inRange(c, 0x061D, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@catamorphism i see ALM is not namestart, but this change makes U+2000…U+200B isNameStart true. they are dashes and spaces, and not ID_Start

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I think this would be far more reliable in using a UnicodeSet. That can be created as the C++ equivalent of a static final immutable object.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@srl295 Fixed

@macchiati Done in 780a947

inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
inRange(c, 0x10000, 0xEFFFF);
}
Expand Down Expand Up @@ -347,7 +353,7 @@ option, or the optional space before an attribute.
No pre, no post.
A message may end with whitespace, so `index` may equal `len()` on exit.
*/
void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
void Parser::parseRequiredWS(UErrorCode& errorCode) {
bool sawWhitespace = false;

// The loop exits either when we consume all the input,
Expand All @@ -358,7 +364,7 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode)
// If whitespace isn't required -- or if we saw it already --
// then the caller is responsible for checking this case and
// setting an error if necessary.
if (!required || sawWhitespace) {
if (sawWhitespace) {
// Not an error.
return;
}
Expand All @@ -380,24 +386,51 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode)
}
}

if (!sawWhitespace && required) {
if (!sawWhitespace) {
ERROR(errorCode);
}
}

void Parser::parseOptionalBidi() {
while (true) {
if (!inBounds()) {
return;
}
if (isBidi(peek())) {
next();
} else {
break;
}
}
}

/*
No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
No pre, no post, because a message may end with whitespace
Matches `s` in the MF2 grammar
*/
void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
parseWhitespaceMaybeRequired(true, errorCode);
parseOptionalBidi();
parseRequiredWS(errorCode);
parseOptionalWhitespace();
normalizedInput += SPACE;
}

/*
No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
*/
void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
parseWhitespaceMaybeRequired(false, errorCode);
void Parser::parseOptionalWhitespace() {
while (true) {
if (!inBounds()) {
return;
}
auto cp = peek();
if (isWhitespace(cp) || isBidi(cp)) {
maybeAdvanceLine();
next();
} else {
break;
}
}
}

// Consumes a single character, signaling an error if `peek()` != `c`
Expand Down Expand Up @@ -442,11 +475,11 @@ void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode)
*/
void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
// No need for error check or bounds check before parseOptionalWhitespace
parseOptionalWhitespace(errorCode);
parseOptionalWhitespace();
// Establish precondition
CHECK_BOUNDS(errorCode);
parseToken(token, errorCode);
parseOptionalWhitespace(errorCode);
parseOptionalWhitespace();
// Guarantee postcondition
CHECK_BOUNDS(errorCode);
}
Expand All @@ -458,12 +491,12 @@ void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCo
then consumes optional whitespace again
*/
void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
// No need for error check or bounds check before parseOptionalWhitespace(errorCode)
parseOptionalWhitespace(errorCode);
// No need for error check or bounds check before parseOptionalWhitespace()
parseOptionalWhitespace();
// Establish precondition
CHECK_BOUNDS(errorCode);
parseToken(c, errorCode);
parseOptionalWhitespace(errorCode);
parseOptionalWhitespace();
// Guarantee postcondition
CHECK_BOUNDS(errorCode);
}
Expand All @@ -482,11 +515,17 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) {

U_ASSERT(inBounds());

if (!isNameStart(peek())) {
if (!(isNameStart(peek()) || isBidi(peek()))) {
ERROR(errorCode);
return name;
}

// name = [bidi] name-start *name-char [bidi]

// [bidi]
parseOptionalBidi();

// name-start *name-char
while (isNameChar(peek())) {
UChar32 c = peek();
name += c;
Expand All @@ -497,6 +536,10 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) {
break;
}
}

// [bidi]
parseOptionalBidi();

return name;
}

Expand Down Expand Up @@ -853,7 +896,7 @@ void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode)
// about whether whitespace precedes another
// attribute, or the '=' sign
int32_t savedIndex = index;
parseOptionalWhitespace(errorCode);
parseOptionalWhitespace();

Operand rand;
if (peek() == EQUALS) {
Expand Down Expand Up @@ -1149,7 +1192,7 @@ the comment in `parseOptions()` for details.
// (the character is either the required space before an annotation, or optional
// trailing space after the literal or variable). It's still ambiguous which
// one does apply.
parseOptionalWhitespace(status);
parseOptionalWhitespace();
// Restore precondition
CHECK_BOUNDS(status);

Expand Down Expand Up @@ -1220,7 +1263,7 @@ Expression Parser::parseExpression(UErrorCode& status) {
// Parse opening brace
parseToken(LEFT_CURLY_BRACE, status);
// Optional whitespace after opening brace
parseOptionalWhitespace(status);
parseOptionalWhitespace();

Expression::Builder exprBuilder(status);
// Restore precondition
Expand Down Expand Up @@ -1263,7 +1306,7 @@ Expression Parser::parseExpression(UErrorCode& status) {

// Parse optional space
// (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
parseOptionalWhitespace(status);
parseOptionalWhitespace();

// Either an operand or operator (or both) must have been set already,
// so there can't be an error
Expand Down Expand Up @@ -1339,7 +1382,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) {
CHECK_BOUNDS(status);

parseToken(ID_INPUT, status);
parseOptionalWhitespace(status);
parseOptionalWhitespace();

// Restore precondition before calling parseExpression()
CHECK_BOUNDS(status);
Expand Down Expand Up @@ -1400,7 +1443,7 @@ void Parser::parseDeclarations(UErrorCode& status) {
// Avoid looping infinitely
CHECK_ERROR(status);

parseOptionalWhitespace(status);
parseOptionalWhitespace();
// Restore precondition
CHECK_BOUNDS(status);
}
Expand Down Expand Up @@ -1510,8 +1553,8 @@ This is addressed using "backtracking" (similarly to `parseOptions()`).

// We've seen at least one whitespace-key pair, so now we can parse
// *(s key) [s]
while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors
bool wasWhitespace = isWhitespace(peek());
while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidi(peek())) {
bool wasWhitespace = isWhitespace(peek()) || isBidi(peek());
parseRequiredWhitespace(status);
if (!wasWhitespace) {
// Avoid infinite loop when parsing something like:
Expand Down Expand Up @@ -1569,7 +1612,7 @@ Markup Parser::parseMarkup(UErrorCode& status) {
// Consume the '{'
next();
normalizedInput += LEFT_CURLY_BRACE;
parseOptionalWhitespace(status);
parseOptionalWhitespace();
bool closing = false;
switch (peek()) {
case NUMBER_SIGN: {
Expand All @@ -1596,19 +1639,19 @@ Markup Parser::parseMarkup(UErrorCode& status) {

// Parse the options, which must begin with a ' '
// if present
if (inBounds() && isWhitespace(peek())) {
if (inBounds() && (isWhitespace(peek()) || isBidi(peek()))) {
OptionAdder<Markup::Builder> optionAdder(builder);
parseOptions(optionAdder, status);
}

// Parse the attributes, which also must begin
// with a ' '
if (inBounds() && isWhitespace(peek())) {
if (inBounds() && (isWhitespace(peek()) || isBidi(peek()))) {
AttributeAdder<Markup::Builder> attrAdder(builder);
parseAttributes(attrAdder, status);
}

parseOptionalWhitespace(status);
parseOptionalWhitespace();

bool standalone = false;
// Check if this is a standalone or not
Expand Down Expand Up @@ -1656,7 +1699,7 @@ std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
isMarkup = true;
break;
}
if (!isWhitespace(c)){
if (!(isWhitespace(c) || isBidi(c))) {
break;
}
tempIndex++;
Expand Down Expand Up @@ -1740,7 +1783,7 @@ void Parser::parseSelectors(UErrorCode& status) {
// "Backtracking" is required here. It's not clear if whitespace is
// (`[s]` selector) or (`[s]` variant)
while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) {
parseOptionalWhitespace(status);
parseOptionalWhitespace();
// Restore precondition
CHECK_BOUNDS(status);
if (peek() != LEFT_CURLY_BRACE) {
Expand Down Expand Up @@ -1770,9 +1813,9 @@ void Parser::parseSelectors(UErrorCode& status) {
} \

// Parse variants
while (isWhitespace(peek()) || isKeyStart(peek())) {
while (isWhitespace(peek()) || isBidi(peek()) || isKeyStart(peek())) {
// Trailing whitespace is allowed
parseOptionalWhitespace(status);
parseOptionalWhitespace();
if (!inBounds()) {
return;
}
Expand Down Expand Up @@ -1871,7 +1914,7 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
bool complex = false;
// First, "look ahead" to determine if this is a simple or complex
// message. To do that, check the first non-whitespace character.
while (inBounds(index) && isWhitespace(peek())) {
while (inBounds(index) && (isWhitespace(peek()) || isBidi(peek()))) {
next();
}

Expand All @@ -1891,10 +1934,10 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
// Message can be empty, so we need to only look ahead
// if we know it's non-empty
if (complex) {
parseOptionalWhitespace(status);
parseOptionalWhitespace();
parseDeclarations(status);
parseBody(status);
parseOptionalWhitespace(status);
parseOptionalWhitespace();
} else {
// Simple message
// For normalization, quote the pattern
Expand Down
5 changes: 3 additions & 2 deletions icu4c/source/i18n/messageformat2_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,10 @@ namespace message2 {
void parseInputDeclaration(UErrorCode&);
void parseSelectors(UErrorCode&);

void parseWhitespaceMaybeRequired(bool, UErrorCode&);
void parseRequiredWS(UErrorCode&);
void parseRequiredWhitespace(UErrorCode&);
void parseOptionalWhitespace(UErrorCode&);
void parseOptionalBidi();
void parseOptionalWhitespace();
void parseToken(UChar32, UErrorCode&);
void parseTokenWithWhitespace(UChar32, UErrorCode&);
void parseToken(const std::u16string_view&, UErrorCode&);
Expand Down
10 changes: 0 additions & 10 deletions icu4c/source/test/intltest/messageformat2test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,6 @@

using namespace icu::message2;

/*
TODO: Tests need to be unified in a single format that
both ICU4C and ICU4J can use, rather than being embedded in code.

Tests are included in their current state to give a sense of
how much test coverage has been achieved. Most of the testing is
of the parser/serializer; the formatter needs to be tested more
thoroughly.
*/

void
TestMessageFormat2::runIndexedTest(int32_t index, UBool exec,
const char* &name, char* /*par*/) {
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/test/intltest/messageformat2test_read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
runTestsFromJsonFile(*this, "spec/functions/time.json", errorCode);

// Other tests (non-spec)
// TODO: https://github.com/unicode-org/message-format-wg/pull/902 will
// move the bidi tests into the spec
runTestsFromJsonFile(*this, "bidi.json", errorCode);
runTestsFromJsonFile(*this, "more-functions.json", errorCode);
runTestsFromJsonFile(*this, "valid-tests.json", errorCode);
runTestsFromJsonFile(*this, "resolution-errors.json", errorCode);
Expand Down
11 changes: 6 additions & 5 deletions icu4c/source/test/intltest/messageformat2test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,8 @@ class TestUtils {
if (!roundTrip(in, mf.getDataModel(), out)
// For now, don't round-trip messages with these errors,
// since duplicate options are dropped
&& testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR) {
&& (testCase.expectSuccess() ||
testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR)) {
failRoundTrip(tmsg, testCase, in, out);
}

Expand Down Expand Up @@ -291,10 +292,10 @@ class TestUtils {
}
// Re-run the formatter
result = mf.formatToString(MessageArguments(testCase.getArguments(), errorCode), errorCode);
if (!testCase.outputMatches(result)) {
failWrongOutput(tmsg, testCase, result);
return;
}
}
if (!testCase.outputMatches(result)) {
failWrongOutput(tmsg, testCase, result);
return;
}
errorCode.reset();
}
Expand Down
Loading