diff --git a/doc/techref/encodings.md b/doc/techref/encodings.md
index 638370e9bd3..faa866a8fcd 100644
--- a/doc/techref/encodings.md
+++ b/doc/techref/encodings.md
@@ -1,14 +1,12 @@
 # Supported Encodings and Non-ASCII Characters
 
-GMT supports a number of encodings and each encoding contains a set of ASCII and non-ASCII
-characters. Below are some of the most common encodings and characters that are supported.
+GMT supports a number of encodings, and each encoding contains a set of ASCII and
+non-ASCII characters. In PyGMT, you can use any of these ASCII and non-ASCII characters
+in arguments and text strings. When using non-ASCII characters in PyGMT, the easiest way
+is to copy and paste the character from the encoding tables below.
 
-In PyGMT, you can use any of these ASCII and non-ASCII characters in arguments and text
-strings. When using non-ASCII characters in PyGMT, the easiest way is to copy and paste
-the character from the tables below.
-
-**Note**: The special character � (REPLACEMENT CHARACTER) is used to indicate that
-the character is not defined in the encoding.
+**Note**: The special character � (REPLACEMENT CHARACTER) is used to indicate
+that the character is not defined in the encoding.
 
 ## Adobe ISOLatin1+ Encoding
 
@@ -106,3 +104,27 @@ the Unicode character set.
 | **\35x** | ➨ | ➩ | ➪ | ➫ | ➬ | ➭ | ➮ | ➯ |
 | **\36x** | � | ➱ | ➲ | ➳ | ➴ | ➵ | ➶ | ➷ |
 | **\37x** | ➸ | ➹ | ➺ | ➻ | ➼ | ➽ | ➾ | � |
+
+## ISO/IEC 8859
+
+GMT also supports the ISO/IEC 8859 standard for 8-bit character encodings. Refer to
+<https://en.wikipedia.org/wiki/ISO/IEC_8859> for descriptions of the different parts of
+the standard.
+
+For a list of the characters in each part of the standard, refer to the following links:
+
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-1>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-2>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-3>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-4>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-5>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-6>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-7>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-8>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-9>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-10>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-11>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-13>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-14>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-15>
+- <https://en.wikipedia.org/wiki/ISO/IEC_8859-16>
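The copy-and-paste workflow described in the updated `doc/techref/encodings.md` can be tried with a short script. The following is a minimal sketch and not part of the patch: the region, projection, sample string, and output filename are illustrative assumptions, and a working GMT installation is assumed; only the non-ASCII characters (copied from the Adobe ISOLatin1+ table) matter here.

```python
import pygmt

fig = pygmt.Figure()
fig.basemap(region=[0, 10, 0, 10], projection="X10c", frame=True)
# "°" and "±" are pasted directly from the Adobe ISOLatin1+ table in the docs;
# PyGMT translates them to the octal codes that GMT expects.
fig.text(x=5, y=5, text="Temperature: 25°C ± 2°C")
fig.savefig("nonascii_text.png")
```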
diff --git a/pygmt/encodings.py b/pygmt/encodings.py
index 2cfda9b5728..44ed3153e85 100644
--- a/pygmt/encodings.py
+++ b/pygmt/encodings.py
@@ -1,13 +1,13 @@
 """
-Adobe character encodings supported by GMT.
+Character encodings supported by GMT.
 
-Currently, only Adobe Symbol, Adobe ZapfDingbats, and Adobe ISOLatin1+ encodings are
-supported.
+Currently, Adobe Symbol, Adobe ZapfDingbats, Adobe ISOLatin1+, and ISO-8859-x (x can be
+1-11, 13-16) encodings are supported. The Adobe Standard encoding is not supported.
 
-The corresponding Unicode characters in each Adobe character encoding are generated
-from the mapping table and conversion script in the GMT-octal-codes
-(https://github.com/seisman/GMT-octal-codes) repository. Refer to that repository for
-details.
+The corresponding Unicode characters in each Adobe character encoding are generated from
+the mapping tables and conversion scripts in the
+`GMT-octal-codes repository <https://github.com/seisman/GMT-octal-codes>`__. Refer to
+that repository for details.
 
 Some code points are undefined and are assigned with the replacement character
 (``\ufffd``).
@@ -16,14 +16,17 @@
 ----------
 
 - GMT-octal-codes: https://github.com/seisman/GMT-octal-codes
-- GMT official documentation: https://docs.generic-mapping-tools.org/dev/reference/octal-codes.html
+- GMT documentation: https://docs.generic-mapping-tools.org/dev/reference/octal-codes.html
 - Adobe Postscript Language Reference: https://www.adobe.com/jp/print/postscript/pdfs/PLRM.pdf
-- ISOLatin1+: https://en.wikipedia.org/wiki/PostScript_Latin_1_Encoding
+- Adobe ISOLatin1+: https://en.wikipedia.org/wiki/PostScript_Latin_1_Encoding
 - Adobe Symbol: https://en.wikipedia.org/wiki/Symbol_(typeface)
-- Zapf Dingbats: https://en.wikipedia.org/wiki/Zapf_Dingbats
+- Adobe ZapfDingbats: https://en.wikipedia.org/wiki/Zapf_Dingbats
 - Adobe Glyph List: https://github.com/adobe-type-tools/agl-aglfn
+- ISO-8859: https://en.wikipedia.org/wiki/ISO/IEC_8859
 """
 
+import codecs
+
 # Dictionary of character mappings for different encodings.
 charset: dict = {}
 
@@ -129,3 +132,12 @@
         strict=False,
     )
 )
+
+# ISO-8859-x charsets (x can be 1-11, 13-16).
+for i in range(1, 17):
+    if i == 12:  # ISO-8859-12 was abandoned.
+        continue
+    charset[f"ISO-8859-{i}"] = {
+        code: codecs.decode(bytes([code]), f"iso8859_{i}", errors="replace")
+        for code in [*range(0o040, 0o200), *range(0o240, 0o400)]
+    }
diff --git a/pygmt/helpers/__init__.py b/pygmt/helpers/__init__.py
index 862abbbdd64..08583896b6c 100644
--- a/pygmt/helpers/__init__.py
+++ b/pygmt/helpers/__init__.py
@@ -15,6 +15,7 @@
     unique_name,
 )
 from pygmt.helpers.utils import (
+    _check_encoding,
    _validate_data_input,
     args_in_kwargs,
     build_arg_list,
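As a sanity check on the new `charset` entries, the snippet below shows how the generated ISO-8859-4 mapping can be inspected. It is a sketch for illustration only; the character `ā` and its octal code `\340` are taken from the ISO-8859-4 doctest added later in this patch.

```python
import codecs

from pygmt.encodings import charset

# The dictionary comprehension above decodes each byte with Python's stdlib codecs,
# so the generated table agrees with a direct decode of the same code point.
assert codecs.decode(bytes([0o340]), "iso8859_4", errors="replace") == "ā"
assert charset["ISO-8859-4"][0o340] == "ā"

# Reverse lookup: find the octal escape that GMT expects for a given character.
code = next(k for k, v in charset["ISO-8859-4"].items() if v == "ā")
print(f"\\{code:03o}")  # prints \340
```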
diff --git a/pygmt/helpers/utils.py b/pygmt/helpers/utils.py
index 2e981266575..cd54d6fc18e 100644
--- a/pygmt/helpers/utils.py
+++ b/pygmt/helpers/utils.py
@@ -115,6 +115,78 @@ def _validate_data_input(
                 raise GMTInvalidInput("data must provide x, y, and z columns.")
 
 
+def _check_encoding(
+    argstr: str,
+) -> Literal[
+    "ascii",
+    "ISOLatin1+",
+    "ISO-8859-1",
+    "ISO-8859-2",
+    "ISO-8859-3",
+    "ISO-8859-4",
+    "ISO-8859-5",
+    "ISO-8859-6",
+    "ISO-8859-7",
+    "ISO-8859-8",
+    "ISO-8859-9",
+    "ISO-8859-10",
+    "ISO-8859-11",
+    "ISO-8859-13",
+    "ISO-8859-14",
+    "ISO-8859-15",
+    "ISO-8859-16",
+]:
+    """
+    Check the charset encoding of a string.
+
+    All characters in the string must be in the same charset encoding, otherwise the
+    default ``ISOLatin1+`` encoding is returned. Characters in the Adobe Symbol and
+    ZapfDingbats encodings are also checked because they are independent of the choice
+    of encoding.
+
+    Parameters
+    ----------
+    argstr
+        The string to be checked.
+
+    Returns
+    -------
+    encoding
+        The encoding of the string.
+
+    Examples
+    --------
+    >>> _check_encoding("123ABC+-?!")  # ASCII characters only
+    'ascii'
+    >>> _check_encoding("12AB±β①②")  # Characters in ISOLatin1+
+    'ISOLatin1+'
+    >>> _check_encoding("12ABāáâãäåβ①②")  # Characters in ISO-8859-4
+    'ISO-8859-4'
+    >>> _check_encoding("12ABŒā")  # Mixed characters in ISOLatin1+ (Œ) and ISO-8859-4 (ā)
+    'ISOLatin1+'
+    >>> _check_encoding("123AB中文")  # Characters not in any charset encoding
+    'ISOLatin1+'
+    """
+    # Return "ascii" if the string only contains ASCII characters.
+    if all(32 <= ord(c) <= 126 for c in argstr):
+        return "ascii"
+    # Loop through all supported encodings and check if all characters in the string
+    # are in the charset of the encoding. If all characters are in the charset, return
+    # the encoding. The ISOLatin1+ encoding is checked first because it is the default
+    # and most common encoding.
+    adobe_chars = set(charset["Symbol"].values()) | set(
+        charset["ZapfDingbats"].values()
+    )
+    for encoding in ["ISOLatin1+"] + [f"ISO-8859-{i}" for i in range(1, 17)]:
+        if encoding == "ISO-8859-12":  # ISO-8859-12 was abandoned. Skip it.
+            continue
+        if all(c in (set(charset[encoding].values()) | adobe_chars) for c in argstr):
+            return encoding  # type: ignore[return-value]
+    # Return the "ISOLatin1+" encoding if the string contains characters from multiple
+    # charset encodings or contains characters that are not in any charset encoding.
+    return "ISOLatin1+"
+
+
 def data_kind(
     data: Any = None, required: bool = True
 ) -> Literal["arg", "file", "geojson", "grid", "image", "matrix", "vectors"]:
@@ -192,17 +264,41 @@ def data_kind(
     return kind
 
 
-def non_ascii_to_octal(argstr: str) -> str:
+def non_ascii_to_octal(
+    argstr: str,
+    encoding: Literal[
+        "ascii",
+        "ISOLatin1+",
+        "ISO-8859-1",
+        "ISO-8859-2",
+        "ISO-8859-3",
+        "ISO-8859-4",
+        "ISO-8859-5",
+        "ISO-8859-6",
+        "ISO-8859-7",
+        "ISO-8859-8",
+        "ISO-8859-9",
+        "ISO-8859-10",
+        "ISO-8859-11",
+        "ISO-8859-13",
+        "ISO-8859-14",
+        "ISO-8859-15",
+        "ISO-8859-16",
+    ] = "ISOLatin1+",
+) -> str:
     r"""
     Translate non-ASCII characters to their corresponding octal codes.
 
-    Currently, only characters in the ISOLatin1+ charset and Symbol/ZapfDingbats fonts
-    are supported.
+    Currently, only non-ASCII characters in the Adobe ISOLatin1+, Adobe Symbol, Adobe
+    ZapfDingbats, and ISO-8859-x (x can be 1-11, 13-16) encodings are supported.
+    The Adobe Standard encoding is not supported yet.
 
     Parameters
     ----------
     argstr
         The string to be translated.
+    encoding
+        The encoding of characters in the string.
 
     Returns
     -------
@@ -219,9 +315,11 @@
     '@%34%\\041@%%@%34%\\176@%%@%34%\\241@%%@%34%\\376@%%'
     >>> non_ascii_to_octal("ABC ±120° DEF α ♥")
     'ABC \\261120\\260 DEF @~\\141@~ @%34%\\252@%%'
+    >>> non_ascii_to_octal("12ABāáâãäåβ①②", encoding="ISO-8859-4")
+    '12AB\\340\\341\\342\\343\\344\\345@~\\142@~@%34%\\254@%%@%34%\\255@%%'
     """  # noqa: RUF002
-    # Return the string if it only contains printable ASCII characters from 32 to 126.
-    if all(32 <= ord(c) <= 126 for c in argstr):
+    # Return the input string if it only contains ASCII characters.
+    if encoding == "ascii" or all(32 <= ord(c) <= 126 for c in argstr):
         return argstr
 
     # Dictionary mapping non-ASCII characters to octal codes
@@ -232,15 +330,15 @@
     mapping.update(
         {c: f"@%34%\\{i:03o}@%%" for i, c in charset["ZapfDingbats"].items()}
     )
-    # Adobe ISOLatin1+ charset. Put at the end.
-    mapping.update({c: f"\\{i:03o}" for i, c in charset["ISOLatin1+"].items()})
+    # ISOLatin1+ or ISO-8859-x charset.
+    mapping.update({c: f"\\{i:03o}" for i, c in charset[encoding].items()})
 
     # Remove any printable characters
     mapping = {k: v for k, v in mapping.items() if k not in string.printable}
     return argstr.translate(str.maketrans(mapping))
 
 
-def build_arg_list(
+def build_arg_list(  # noqa: PLR0912
     kwdict: dict[str, Any],
     confdict: dict[str, str] | None = None,
     infile: str | pathlib.PurePath | Sequence[str | pathlib.PurePath] | None = None,
@@ -310,6 +408,10 @@ def build_arg_list(
     >>> build_arg_list(
     ...     dict(A="0", B=True, C="rainbow.cpt", D="abc def", I=[1, 2], R="0/1/2/3"),
     ...     infile=["f1.txt", "f2.txt"],
     ...     outfile="out.txt",
     ... )
     ['f1.txt', 'f2.txt', '-A0', '-B', '--FORMAT_DATE_MAP=o dd', '->out.txt']
+    >>> build_arg_list(dict(B="12ABāβ①②"))
+    ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-4']
+    >>> build_arg_list(dict(B="12ABāβ①②"), confdict=dict(PS_CHAR_ENCODING="ISO-8859-5"))
+    ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-5']
     >>> print(build_arg_list(dict(R="1/2/3/4", J="X4i", watre=True)))
     Traceback (most recent call last):
     ...
@@ -324,11 +426,22 @@
         elif value is True:
             gmt_args.append(f"-{key}")
         elif is_nonstr_iter(value):
-            gmt_args.extend(non_ascii_to_octal(f"-{key}{_value}") for _value in value)
+            gmt_args.extend(f"-{key}{_value}" for _value in value)
         else:
-            gmt_args.append(non_ascii_to_octal(f"-{key}{value}"))
+            gmt_args.append(f"-{key}{value}")
+
+    # Convert non-ASCII characters (if any) in the arguments to octal codes
+    encoding = _check_encoding("".join(gmt_args))
+    if encoding != "ascii":
+        gmt_args = [non_ascii_to_octal(arg, encoding=encoding) for arg in gmt_args]
     gmt_args = sorted(gmt_args)
 
+    # Set --PS_CHAR_ENCODING=encoding if necessary
+    if encoding not in {"ascii", "ISOLatin1+"} and not (
+        confdict and "PS_CHAR_ENCODING" in confdict
+    ):
+        gmt_args.append(f"--PS_CHAR_ENCODING={encoding}")
+
     if confdict:
         gmt_args.extend(f"--{key}={value}" for key, value in confdict.items())
 
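To see how the three helpers fit together, here is a small sketch that is not part of the patch. `_check_encoding` and `non_ascii_to_octal` are private helpers, so this is for experimentation rather than public API use; the expected values in the comments mirror the doctests added above.

```python
from pygmt.helpers import _check_encoding, build_arg_list
from pygmt.helpers.utils import non_ascii_to_octal

s = "12ABāβ①②"

# "ā" forces ISO-8859-4; "β" and "①②" are Symbol/ZapfDingbats characters,
# which are accepted regardless of the chosen encoding.
encoding = _check_encoding(s)
print(encoding)  # ISO-8859-4

# Non-ASCII characters become the octal escapes understood by GMT.
print(non_ascii_to_octal(s, encoding=encoding))

# build_arg_list() runs both steps itself and appends --PS_CHAR_ENCODING=ISO-8859-4
# because no PS_CHAR_ENCODING was given via confdict.
print(build_arg_list({"B": s}))
```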
diff --git a/pygmt/src/text.py b/pygmt/src/text.py
index 484f885997a..3e72b30b328 100644
--- a/pygmt/src/text.py
+++ b/pygmt/src/text.py
@@ -6,6 +6,7 @@
 from pygmt.clib import Session
 from pygmt.exceptions import GMTInvalidInput
 from pygmt.helpers import (
+    _check_encoding,
     build_arg_list,
     data_kind,
     fmt_docstring,
@@ -59,13 +60,12 @@ def text_(  # noqa: PLR0912
     - ``x``/``y``, and ``text``
     - ``position`` and ``text``
 
-    The text strings passed via the ``text`` parameter can contain ASCII
-    characters and non-ASCII characters defined in the ISOLatin1+ encoding
-    (i.e., IEC_8859-1), and the Symbol and ZapfDingbats character sets.
-    See :gmt-docs:`reference/octal-codes.html` for the full list of supported
-    non-ASCII characters.
+    The text strings passed via the ``text`` parameter can contain ASCII characters and
+    non-ASCII characters defined in the Adobe ISOLatin1+, Adobe Symbol, Adobe
+    ZapfDingbats and ISO-8859-x (x can be 1-11, 13-16) encodings. Refer to
+    :doc:`techref/encodings` for the full list of supported non-ASCII characters.
 
-    Full option list at :gmt-docs:`text.html`
+    Full option list at :gmt-docs:`text.html`.
 
     {aliases}
 
@@ -226,13 +226,24 @@ def text_(  # noqa: PLR0912
         kwargs["t"] = ""
 
     # Append text at last column. Text must be passed in as str type.
+    confdict = {}
     if kind == "vectors":
-        extra_arrays.append(
-            np.vectorize(non_ascii_to_octal)(np.atleast_1d(text).astype(str))
-        )
+        text = np.atleast_1d(text).astype(str)
+        encoding = _check_encoding("".join(text))
+        if encoding != "ascii":
+            text = np.vectorize(non_ascii_to_octal, excluded="encoding")(
+                text, encoding=encoding
+            )
+        extra_arrays.append(text)
+
+        if encoding not in {"ascii", "ISOLatin1+"}:
+            confdict = {"PS_CHAR_ENCODING": encoding}
 
     with Session() as lib:
         with lib.virtualfile_in(
             check_kind="vector", data=textfiles, x=x, y=y, extra_arrays=extra_arrays
         ) as vintbl:
-            lib.call_module(module="text", args=build_arg_list(kwargs, infile=vintbl))
+            lib.call_module(
+                module="text",
+                args=build_arg_list(kwargs, infile=vintbl, confdict=confdict),
+            )
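At the user level the change is transparent. The sketch below (region, projection, and output filename are illustrative; the strings are borrowed from the new test) simply calls `Figure.text` with ISO-8859-4 text and lets `text_` detect the encoding and set `PS_CHAR_ENCODING` for that call. A working GMT installation is assumed.

```python
import pygmt

fig = pygmt.Figure()
fig.basemap(region=[0, 10, 0, 10], projection="X10c", frame=True)
# The Latin-4 letters are detected as ISO-8859-4; the Greek letters and dingbats
# go through the Symbol/ZapfDingbats mappings, so no manual PS_CHAR_ENCODING
# configuration is needed.
fig.text(x=[5, 5], y=[3, 5], text=["xytext1:ųúûüũūαζ∆❡", "xytext2:íîī∑π∇✉"])
fig.savefig("iso8859_text.png")
```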
diff --git a/pygmt/tests/baseline/test_text_nonascii_iso8859.png.dvc b/pygmt/tests/baseline/test_text_nonascii_iso8859.png.dvc
new file mode 100644
index 00000000000..ee0f41600e9
--- /dev/null
+++ b/pygmt/tests/baseline/test_text_nonascii_iso8859.png.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: a0f35a1d58c95e6589c7397e7660e946
+  size: 17089
+  hash: md5
+  path: test_text_nonascii_iso8859.png
diff --git a/pygmt/tests/test_text.py b/pygmt/tests/test_text.py
index 8543734bb30..64781c514bc 100644
--- a/pygmt/tests/test_text.py
+++ b/pygmt/tests/test_text.py
@@ -434,3 +434,16 @@ def test_text_quotation_marks():
     fig.basemap(projection="X4c/2c", region=[0, 4, 0, 2], frame=0)
     fig.text(x=2, y=1, text='\\234 ‘ ’ " “ ”', font="20p")  # noqa: RUF001
     return fig
+
+
+@pytest.mark.mpl_image_compare
+def test_text_nonascii_iso8859():
+    """
+    Test passing text strings with non-ASCII characters in the ISO-8859-4 encoding.
+    """
+    fig = Figure()
+    fig.basemap(region=[0, 10, 0, 10], projection="X10c", frame=["WSEN+tAāáâãäåB"])
+    fig.text(position="TL", text="position-text:1ÉĘËĖ2")
+    fig.text(x=1, y=1, text="xytext:1éęëė2")
+    fig.text(x=[5, 5], y=[3, 5], text=["xytext1:ųúûüũūαζ∆❡", "xytext2:íîī∑π∇✉"])
+    return fig