From b57aebe73e8146dd3448605518ece6bed1099bec Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 23 Oct 2024 19:02:01 +0400 Subject: [PATCH] fix: Ensure that `ASCII*` table formats do not use the UTF8 ellipsis char when truncating rows/cols/values --- crates/polars-core/src/fmt.rs | 131 +++++++++++++++++----------- crates/polars/src/lib.rs | 1 + py-polars/polars/config.py | 7 +- py-polars/polars/selectors.py | 2 +- py-polars/tests/unit/test_config.py | 57 ++++++++++++ 5 files changed, 141 insertions(+), 57 deletions(-) diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index c930b9e94da7..91ef3fb822f2 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -125,6 +125,13 @@ fn get_str_len_limit() -> usize { fn get_list_len_limit() -> usize { parse_env_var_limit(FMT_TABLE_CELL_LIST_LEN, DEFAULT_LIST_LEN_LIMIT) } +#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] +fn get_ellipsis() -> &'static str { + match std::env::var(FMT_TABLE_FORMATTING).as_deref().unwrap_or("") { + preset if preset.starts_with("ASCII") => "...", + _ => "…", + } +} macro_rules! format_array { ($f:ident, $a:expr, $dtype:expr, $name:expr, $array_type:expr) => {{ @@ -424,7 +431,7 @@ impl Debug for DataFrame { } } #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] -fn make_str_val(v: &str, truncate: usize) -> String { +fn make_str_val(v: &str, truncate: usize, ellipsis: &String) -> String { let v_trunc = &v[..v .char_indices() .take(truncate) @@ -434,14 +441,19 @@ fn make_str_val(v: &str, truncate: usize) -> String { if v == v_trunc { v.to_string() } else { - format!("{v_trunc}…") + format!("{v_trunc}{ellipsis}") } } #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] -fn field_to_str(f: &Field, str_truncate: usize) -> (String, usize) { - let name = make_str_val(f.name(), str_truncate); - let name_length = name.len(); +fn field_to_str( + f: &Field, + str_truncate: usize, + ellipsis: &String, + padding: usize, +) -> (String, usize) { + let name = make_str_val(f.name(), str_truncate, ellipsis); + let name_length = name.chars().count(); let mut column_name = name; if env_is_true(FMT_TABLE_HIDE_COLUMN_NAMES) { column_name = "".to_string(); @@ -473,11 +485,11 @@ fn field_to_str(f: &Field, str_truncate: usize) -> (String, usize) { format!("{column_name}{separator}{column_dtype}") }; let mut s_len = std::cmp::max(name_length, dtype_length); - let separator_length = separator.trim().len(); + let separator_length = separator.trim().chars().count(); if s_len < separator_length { s_len = separator_length; } - (s, s_len + 2) + (s, s_len + padding) } #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] @@ -487,27 +499,29 @@ fn prepare_row( n_last: usize, str_truncate: usize, max_elem_lengths: &mut [usize], + ellipsis: &String, + padding: usize, ) -> Vec { let reduce_columns = n_first + n_last < row.len(); let n_elems = n_first + n_last + reduce_columns as usize; let mut row_strings = Vec::with_capacity(n_elems); for (idx, v) in row[0..n_first].iter().enumerate() { - let elem_str = make_str_val(v, str_truncate); - let elem_len = elem_str.len() + 2; + let elem_str = make_str_val(v, str_truncate, ellipsis); + let elem_len = elem_str.chars().count() + padding; if max_elem_lengths[idx] < elem_len { max_elem_lengths[idx] = elem_len; }; row_strings.push(elem_str); } if reduce_columns { - row_strings.push("…".to_string()); - max_elem_lengths[n_first] = 3; + row_strings.push(ellipsis.to_string()); + max_elem_lengths[n_first] = ellipsis.chars().count() + padding; } let elem_offset = n_first + reduce_columns as usize; for (idx, v) in row[row.len() - n_last..].iter().enumerate() { - let elem_str = make_str_val(v, str_truncate); - let elem_len = elem_str.len() + 2; + let elem_str = make_str_val(v, str_truncate, ellipsis); + let elem_len = elem_str.chars().count() + padding; let elem_idx = elem_offset + idx; if max_elem_lengths[elem_idx] < elem_len { max_elem_lengths[elem_idx] = elem_len; @@ -542,16 +556,36 @@ impl Display for DataFrame { "The column lengths in the DataFrame are not equal." ); + let table_style = std::env::var(FMT_TABLE_FORMATTING).unwrap_or("DEFAULT".to_string()); + let is_utf8 = !table_style.starts_with("ASCII"); + let preset = match table_style.as_str() { + "ASCII_FULL" => ASCII_FULL, + "ASCII_FULL_CONDENSED" => ASCII_FULL_CONDENSED, + "ASCII_NO_BORDERS" => ASCII_NO_BORDERS, + "ASCII_BORDERS_ONLY" => ASCII_BORDERS_ONLY, + "ASCII_BORDERS_ONLY_CONDENSED" => ASCII_BORDERS_ONLY_CONDENSED, + "ASCII_HORIZONTAL_ONLY" => ASCII_HORIZONTAL_ONLY, + "ASCII_MARKDOWN" | "MARKDOWN" => ASCII_MARKDOWN, + "UTF8_FULL" => UTF8_FULL, + "UTF8_FULL_CONDENSED" => UTF8_FULL_CONDENSED, + "UTF8_NO_BORDERS" => UTF8_NO_BORDERS, + "UTF8_BORDERS_ONLY" => UTF8_BORDERS_ONLY, + "UTF8_HORIZONTAL_ONLY" => UTF8_HORIZONTAL_ONLY, + "NOTHING" => NOTHING, + _ => UTF8_FULL_CONDENSED, + }; + let ellipsis = get_ellipsis().to_string(); + let ellipsis_len = ellipsis.chars().count(); let max_n_cols = get_col_limit(); let max_n_rows = get_row_limit(); let str_truncate = get_str_len_limit(); + let padding = 2; let (n_first, n_last) = if self.width() > max_n_cols { ((max_n_cols + 1) / 2, max_n_cols / 2) } else { (self.width(), 0) }; - let reduce_columns = n_first + n_last < self.width(); let n_tbl_cols = n_first + n_last + reduce_columns as usize; let mut names = Vec::with_capacity(n_tbl_cols); @@ -559,39 +593,19 @@ impl Display for DataFrame { let fields = self.fields(); for field in fields[0..n_first].iter() { - let (s, l) = field_to_str(field, str_truncate); + let (s, l) = field_to_str(field, str_truncate, &ellipsis, padding); names.push(s); name_lengths.push(l); } if reduce_columns { - names.push("…".into()); - name_lengths.push(3); + names.push(ellipsis.clone()); + name_lengths.push(ellipsis_len); } for field in fields[self.width() - n_last..].iter() { - let (s, l) = field_to_str(field, str_truncate); + let (s, l) = field_to_str(field, str_truncate, &ellipsis, padding); names.push(s); name_lengths.push(l); } - let (preset, is_utf8) = match std::env::var(FMT_TABLE_FORMATTING) - .as_deref() - .unwrap_or("DEFAULT") - { - "ASCII_FULL" => (ASCII_FULL, false), - "ASCII_FULL_CONDENSED" => (ASCII_FULL_CONDENSED, false), - "ASCII_NO_BORDERS" => (ASCII_NO_BORDERS, false), - "ASCII_BORDERS_ONLY" => (ASCII_BORDERS_ONLY, false), - "ASCII_BORDERS_ONLY_CONDENSED" => (ASCII_BORDERS_ONLY_CONDENSED, false), - "ASCII_HORIZONTAL_ONLY" => (ASCII_HORIZONTAL_ONLY, false), - "ASCII_MARKDOWN" => (ASCII_MARKDOWN, false), - "UTF8_FULL" => (UTF8_FULL, true), - "UTF8_FULL_CONDENSED" => (UTF8_FULL_CONDENSED, true), - "UTF8_NO_BORDERS" => (UTF8_NO_BORDERS, true), - "UTF8_BORDERS_ONLY" => (UTF8_BORDERS_ONLY, true), - "UTF8_HORIZONTAL_ONLY" => (UTF8_HORIZONTAL_ONLY, true), - "NOTHING" => (NOTHING, false), - "DEFAULT" => (UTF8_FULL_CONDENSED, true), - _ => (UTF8_FULL_CONDENSED, true), - }; let mut table = Table::new(); table @@ -601,7 +615,6 @@ impl Display for DataFrame { if is_utf8 && env_is_true(FMT_TABLE_ROUNDED_CORNERS) { table.apply_modifier(UTF8_ROUND_CORNERS); } - let mut constraints = Vec::with_capacity(n_tbl_cols); let mut max_elem_lengths: Vec = vec![0; n_tbl_cols]; @@ -610,7 +623,6 @@ impl Display for DataFrame { // Truncate the table if we have more rows than the // configured maximum number of rows let mut rows = Vec::with_capacity(std::cmp::max(max_n_rows, 2)); - let half = max_n_rows / 2; let rest = max_n_rows % 2; @@ -621,13 +633,20 @@ impl Display for DataFrame { .map(|c| c.str_value(i).unwrap()) .collect(); - let row_strings = - prepare_row(row, n_first, n_last, str_truncate, &mut max_elem_lengths); - + let row_strings = prepare_row( + row, + n_first, + n_last, + str_truncate, + &mut max_elem_lengths, + &ellipsis, + padding, + ); rows.push(row_strings); } - let dots = rows[0].iter().map(|_| "…".to_string()).collect(); + let dots = vec![ellipsis.clone(); rows[0].len()]; rows.push(dots); + for i in (height - half)..height { let row = self .get_columns() @@ -635,8 +654,15 @@ impl Display for DataFrame { .map(|c| c.str_value(i).unwrap()) .collect(); - let row_strings = - prepare_row(row, n_first, n_last, str_truncate, &mut max_elem_lengths); + let row_strings = prepare_row( + row, + n_first, + n_last, + str_truncate, + &mut max_elem_lengths, + &ellipsis, + padding, + ); rows.push(row_strings); } table.add_rows(rows); @@ -654,6 +680,8 @@ impl Display for DataFrame { n_last, str_truncate, &mut max_elem_lengths, + &ellipsis, + padding, ); table.add_row(row_strings); } else { @@ -662,10 +690,9 @@ impl Display for DataFrame { } } } else if height > 0 { - let dots: Vec = self.columns.iter().map(|_| "…".to_string()).collect(); + let dots: Vec = vec![ellipsis.clone(); self.columns.len()]; table.add_row(dots); } - let tbl_fallback_width = 100; let tbl_width = std::env::var("POLARS_TABLE_WIDTH") .map(|s| { @@ -683,10 +710,10 @@ impl Display for DataFrame { lower: Width::Fixed(l as u16), upper: Width::Fixed(u as u16), }; - let min_col_width = 5; + let min_col_width = std::cmp::max(5, 3 + padding); for (idx, elem_len) in max_elem_lengths.iter().enumerate() { let mx = std::cmp::min( - str_truncate + 3, // (3 = 2 space chars of padding + ellipsis char) + str_truncate + ellipsis_len + padding, std::cmp::max(name_lengths[idx], *elem_len), ); if mx <= min_col_width { @@ -1011,7 +1038,7 @@ fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result { } } if bytes.len() > width { - write!(f, "\"...")?; + write!(f, "\"…")?; } else { write!(f, "\"")?; } @@ -1138,9 +1165,7 @@ impl Series { if self.is_empty() { return "[]".to_owned(); } - let max_items = get_list_len_limit(); - match max_items { 0 => "[…]".to_owned(), _ if max_items >= self.len() => { diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 5ecc28c94c34..dba9bb39d46d 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -373,6 +373,7 @@ //! * `ASCII_BORDERS_ONLY_CONDENSED` //! * `ASCII_HORIZONTAL_ONLY` //! * `ASCII_MARKDOWN` +//! * `MARKDOWN` //! * `UTF8_FULL` //! * `UTF8_FULL_CONDENSED` //! * `UTF8_NO_BORDERS` diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index d6774cfd126f..07fe4e8002fa 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -195,7 +195,7 @@ def __init__( >>> df = pl.DataFrame({"abc": [1.0, 2.5, 5.0], "xyz": [True, False, True]}) >>> with pl.Config( ... # these options will be set for scope duration - ... tbl_formatting="ASCII_MARKDOWN", + ... tbl_formatting="MARKDOWN", ... tbl_hide_dataframe_shape=True, ... tbl_rows=10, ... ): @@ -1037,7 +1037,8 @@ def set_tbl_formatting( * "ASCII_BORDERS_ONLY": ASCII, borders only. * "ASCII_BORDERS_ONLY_CONDENSED": ASCII, borders only, dense row spacing. * "ASCII_HORIZONTAL_ONLY": ASCII, horizontal lines only. - * "ASCII_MARKDOWN": ASCII, Markdown compatible. + * "ASCII_MARKDOWN": Markdown format (ascii ellipses for truncated values). + * "MARKDOWN": Markdown format (utf8 ellipses for truncated values). * "UTF8_FULL": UTF8, with all borders and lines, including row dividers. * "UTF8_FULL_CONDENSED": Same as UTF8_FULL, but with dense row spacing. * "UTF8_NO_BORDERS": UTF8, no borders. @@ -1060,7 +1061,7 @@ def set_tbl_formatting( ... {"abc": [-2.5, 5.0], "mno": ["hello", "world"], "xyz": [True, False]} ... ) >>> with pl.Config( - ... tbl_formatting="ASCII_MARKDOWN", + ... tbl_formatting="MARKDOWN", ... tbl_hide_column_data_types=True, ... tbl_hide_dataframe_shape=True, ... ): diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 2631f222612a..4cb11506b3f6 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -508,7 +508,7 @@ def as_expr(self) -> Expr: def _re_string(string: str | Collection[str], *, escape: bool = True) -> str: """Return escaped regex, potentially representing multiple string fragments.""" if isinstance(string, str): - rx = f"{re_escape(string)}" if escape else string + rx = re_escape(string) if escape else string else: strings: list[str] = [] for st in string: diff --git a/py-polars/tests/unit/test_config.py b/py-polars/tests/unit/test_config.py index 6c74f0693caa..5c37340cc5f3 100644 --- a/py-polars/tests/unit/test_config.py +++ b/py-polars/tests/unit/test_config.py @@ -497,6 +497,16 @@ def test_shape_format_for_big_numbers() -> None: "╰─────────┴───╯" ) + pl.Config.set_tbl_formatting("ASCII_FULL_CONDENSED") + assert ( + str(df) == "shape: (1, 1_000)\n" + "+---------+-----+\n" + "| 0 (i64) | ... |\n" + "+===============+\n" + "| 1 | ... |\n" + "+---------+-----+" + ) + def test_numeric_right_alignment() -> None: pl.Config.set_tbl_cell_numeric_alignment("RIGHT") @@ -771,6 +781,53 @@ def test_set_fmt_str_lengths_invalid_length() -> None: cfg.set_fmt_str_lengths(-2) +def test_truncated_rows_cols() -> None: + df = pl.DataFrame({f"c{n}": list(range(-n, 100 - n)) for n in range(10)}) + + pl.Config.set_tbl_formatting("UTF8_BORDERS_ONLY", rounded_corners=True) + assert ( + str(df) == "shape: (100, 10)\n" + "╭───────────────────────────────────────────────────╮\n" + "│ c0 c1 c2 c3 … c6 c7 c8 c9 │\n" + "│ --- --- --- --- --- --- --- --- │\n" + "│ i64 i64 i64 i64 i64 i64 i64 i64 │\n" + "╞═══════════════════════════════════════════════════╡\n" + "│ 0 -1 -2 -3 … -6 -7 -8 -9 │\n" + "│ 1 0 -1 -2 … -5 -6 -7 -8 │\n" + "│ 2 1 0 -1 … -4 -5 -6 -7 │\n" + "│ 3 2 1 0 … -3 -4 -5 -6 │\n" + "│ 4 3 2 1 … -2 -3 -4 -5 │\n" + "│ … … … … … … … … … │\n" + "│ 95 94 93 92 … 89 88 87 86 │\n" + "│ 96 95 94 93 … 90 89 88 87 │\n" + "│ 97 96 95 94 … 91 90 89 88 │\n" + "│ 98 97 96 95 … 92 91 90 89 │\n" + "│ 99 98 97 96 … 93 92 91 90 │\n" + "╰───────────────────────────────────────────────────╯" + ) + with pl.Config(tbl_formatting="ASCII_FULL_CONDENSED"): + assert ( + str(df) == "shape: (100, 10)\n" + "+-----+-----+-----+-----+-----+-----+-----+-----+-----+\n" + "| c0 | c1 | c2 | c3 | ... | c6 | c7 | c8 | c9 |\n" + "| --- | --- | --- | --- | | --- | --- | --- | --- |\n" + "| i64 | i64 | i64 | i64 | | i64 | i64 | i64 | i64 |\n" + "+=====================================================+\n" + "| 0 | -1 | -2 | -3 | ... | -6 | -7 | -8 | -9 |\n" + "| 1 | 0 | -1 | -2 | ... | -5 | -6 | -7 | -8 |\n" + "| 2 | 1 | 0 | -1 | ... | -4 | -5 | -6 | -7 |\n" + "| 3 | 2 | 1 | 0 | ... | -3 | -4 | -5 | -6 |\n" + "| 4 | 3 | 2 | 1 | ... | -2 | -3 | -4 | -5 |\n" + "| ... | ... | ... | ... | ... | ... | ... | ... | ... |\n" + "| 95 | 94 | 93 | 92 | ... | 89 | 88 | 87 | 86 |\n" + "| 96 | 95 | 94 | 93 | ... | 90 | 89 | 88 | 87 |\n" + "| 97 | 96 | 95 | 94 | ... | 91 | 90 | 89 | 88 |\n" + "| 98 | 97 | 96 | 95 | ... | 92 | 91 | 90 | 89 |\n" + "| 99 | 98 | 97 | 96 | ... | 93 | 92 | 91 | 90 |\n" + "+-----+-----+-----+-----+-----+-----+-----+-----+-----+" + ) + + def test_warn_unstable(recwarn: pytest.WarningsRecorder) -> None: issue_unstable_warning() assert len(recwarn) == 0