textwrap/
core.rs

1//! Building blocks for advanced wrapping functionality.
2//!
3//! The functions and structs in this module can be used to implement
//! advanced wrapping functionality when [`wrap()`](crate::wrap()) and
//! [`fill()`](crate::fill()) don't do what you want.
6//!
7//! In general, you want to follow these steps when wrapping
8//! something:
9//!
10//! 1. Split your input into [`Fragment`]s. These are abstract blocks
11//!    of text or content which can be wrapped into lines. See
12//!    [`WordSeparator`](crate::word_separators::WordSeparator) for
13//!    how to do this for text.
14//!
15//! 2. Potentially split your fragments into smaller pieces. This
16//!    allows you to implement things like hyphenation. If you use the
//!    `Word` type, you can use the [`WordSplitter`](crate::WordSplitter)
//!    enum for this.
19//!
20//! 3. Potentially break apart fragments that are still too large to
21//!    fit on a single line. This is implemented in [`break_words`].
22//!
23//! 4. Finally take your fragments and put them into lines. There are
24//!    two algorithms for this in the
25//!    [`wrap_algorithms`](crate::wrap_algorithms) module:
26//!    [`wrap_optimal_fit`](crate::wrap_algorithms::wrap_optimal_fit)
27//!    and [`wrap_first_fit`](crate::wrap_algorithms::wrap_first_fit).
28//!    The former produces better line breaks, the latter is faster.
29//!
30//! 5. Iterate through the slices returned by the wrapping functions
31//!    and construct your lines of output.
32//!
33//! Please [open an issue](https://github.com/mgeisler/textwrap/) if
34//! the functionality here is not sufficient or if you have ideas for
35//! improving it. We would love to hear from you!
36
/// The two chars which start an ANSI escape sequence: the escape
/// character followed by `[`. Together they are known as the CSI or
/// “Control Sequence Introducer”. Such sequences are typically used
/// for colored text and are ignored when computing the text width.
const CSI: (char, char) = ('\x1b', '[');
/// An ANSI escape sequence ends with a "final byte" from this range.
const ANSI_FINAL_BYTE: std::ops::RangeInclusive<char> = '\x40'..='\x7e';

/// Skip over an ANSI escape sequence, if one starts at `ch`.
///
/// The `ch` is the `char` which was just read and `chars` yields the
/// chars which follow it. If `ch` is the escape character, `chars`
/// is advanced past the rest of the sequence:
///
/// * `ESC [` (CSI): everything up to and including the first char
///   in the range 0x40–0x7E is consumed.
/// * `ESC ]` (Operating System Command): everything up to and
///   including the first BEL char or `ESC \` pair is consumed.
/// * `ESC` followed by anything else: only that one char is
///   consumed.
///
/// Returns `true` if `ch` was the escape character, meaning that the
/// caller should skip `ch` as well. Returns `false` and leaves
/// `chars` untouched otherwise.
#[inline]
pub(crate) fn skip_ansi_escape_sequence<I: Iterator<Item = char>>(ch: char, chars: &mut I) -> bool {
    if ch != CSI.0 {
        // Not an escape character, so there is nothing to skip.
        return false;
    }

    match chars.next() {
        Some(second) if second == CSI.1 => {
            // This is an ANSI escape code, typically used for colored
            // terminal text. `find` consumes everything up to and
            // including the "final byte", which is all we want.
            let _ = chars.find(|candidate| ANSI_FINAL_BYTE.contains(candidate));
        }
        Some(']') => {
            // This is an Operating System Command. It runs until the
            // String Terminator sequence "\x1b\\" or until the BEL
            // character. The BEL character is non-standard, but it
            // is still used quite often, for example, by GNU ls.
            let mut previous = ']';
            let _ = chars.find(|&current| {
                let terminated = current == '\x07' || (previous == CSI.0 && current == '\\');
                previous = current;
                terminated
            });
        }
        // Some other (or no) char followed the escape character. It
        // has been consumed by the call to `next` above.
        _ => {}
    }

    true
}
84
/// Width of `ch` in columns as reported by the `unicode-width` crate.
/// A `char` for which the crate reports no width counts as zero
/// columns.
#[cfg(feature = "unicode-width")]
#[inline]
fn ch_width(ch: char) -> usize {
    match unicode_width::UnicodeWidthChar::width(ch) {
        Some(columns) => columns,
        None => 0,
    }
}

/// First character which [`ch_width`] will classify as double-width.
/// Please see [`display_width`].
#[cfg(not(feature = "unicode-width"))]
const DOUBLE_WIDTH_CUTOFF: char = '\u{1100}';

/// Approximate width of `ch` in columns: two columns for every `char`
/// at or above [`DOUBLE_WIDTH_CUTOFF`] and one column for all others.
#[cfg(not(feature = "unicode-width"))]
#[inline]
fn ch_width(ch: char) -> usize {
    if ch >= DOUBLE_WIDTH_CUTOFF {
        return 2;
    }
    1
}
105
106/// Compute the display width of `text` while skipping over ANSI
107/// escape sequences.
108///
109/// # Examples
110///
111/// ```
112/// use textwrap::core::display_width;
113///
114/// assert_eq!(display_width("Café Plain"), 10);
115/// assert_eq!(display_width("\u{1b}[31mCafé Rouge\u{1b}[0m"), 10);
116/// assert_eq!(display_width("\x1b]8;;http://example.com\x1b\\This is a link\x1b]8;;\x1b\\"), 14);
117/// ```
118///
119/// **Note:** When the `unicode-width` Cargo feature is disabled, the
120/// width of a `char` is determined by a crude approximation which
121/// simply counts chars below U+1100 as 1 column wide, and all other
122/// characters as 2 columns wide. With the feature enabled, function
123/// will correctly deal with [combining characters] in their
124/// decomposed form (see [Unicode equivalence]).
125///
126/// An example of a decomposed character is “é”, which can be
127/// decomposed into: “e” followed by a combining acute accent: “◌́”.
128/// Without the `unicode-width` Cargo feature, every `char` below
129/// U+1100 has a width of 1. This includes the combining accent:
130///
131/// ```
132/// use textwrap::core::display_width;
133///
134/// assert_eq!(display_width("Cafe Plain"), 10);
135/// #[cfg(feature = "unicode-width")]
136/// assert_eq!(display_width("Cafe\u{301} Plain"), 10);
137/// #[cfg(not(feature = "unicode-width"))]
138/// assert_eq!(display_width("Cafe\u{301} Plain"), 11);
139/// ```
140///
141/// ## Emojis and CJK Characters
142///
143/// Characters such as emojis and [CJK characters] used in the
144/// Chinese, Japanese, and Korean languages are seen as double-width,
145/// even if the `unicode-width` feature is disabled:
146///
147/// ```
148/// use textwrap::core::display_width;
149///
150/// assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20);
151/// assert_eq!(display_width("你好"), 4);  // “Nǐ hǎo” or “Hello” in Chinese
152/// ```
153///
154/// # Limitations
155///
156/// The displayed width of a string cannot always be computed from the
157/// string alone. This is because the width depends on the rendering
158/// engine used. This is particularly visible with [emoji modifier
159/// sequences] where a base emoji is modified with, e.g., skin tone or
160/// hair color modifiers. It is up to the rendering engine to detect
161/// this and to produce a suitable emoji.
162///
163/// A simple example is “❤️”, which consists of “❤” (U+2764: Black
164/// Heart Symbol) followed by U+FE0F (Variation Selector-16). By
165/// itself, “❤” is a black heart, but if you follow it with the
166/// variant selector, you may get a wider red heart.
167///
168/// A more complex example would be “👨‍🦰” which should depict a man
169/// with red hair. Here the computed width is too large — and the
170/// width differs depending on the use of the `unicode-width` feature:
171///
172/// ```
173/// use textwrap::core::display_width;
174///
175/// assert_eq!("👨‍🦰".chars().collect::<Vec<char>>(), ['\u{1f468}', '\u{200d}', '\u{1f9b0}']);
176/// #[cfg(feature = "unicode-width")]
177/// assert_eq!(display_width("👨‍🦰"), 4);
178/// #[cfg(not(feature = "unicode-width"))]
179/// assert_eq!(display_width("👨‍🦰"), 6);
180/// ```
181///
182/// This happens because the grapheme consists of three code points:
183/// “👨” (U+1F468: Man), Zero Width Joiner (U+200D), and “🦰”
184/// (U+1F9B0: Red Hair). You can see them above in the test. With
185/// `unicode-width` enabled, the ZWJ is correctly seen as having zero
186/// width, without it is counted as a double-width character.
187///
188/// ## Terminal Support
189///
190/// Modern browsers typically do a great job at combining characters
191/// as shown above, but terminals often struggle more. As an example,
192/// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but
193/// shows "👨‍🦰" as “👨🦰”.
194///
195/// [combining characters]: https://en.wikipedia.org/wiki/Combining_character
196/// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
197/// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters
198/// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html
199pub fn display_width(text: &str) -> usize {
200    let mut chars = text.chars();
201    let mut width = 0;
202    while let Some(ch) = chars.next() {
203        if skip_ansi_escape_sequence(ch, &mut chars) {
204            continue;
205        }
206        width += ch_width(ch);
207    }
208    width
209}
210
/// The unit of content which the wrapping algorithms place on lines.
///
/// A fragment models an abstract _word_ together with the
/// _whitespace_ which follows it. When the word ends up last on a
/// line, the whitespace is dropped and a so-called _penalty_ is
/// inserted instead (typically `"-"` if the word was hyphenated).
///
/// Wrapping never needs to look at the actual content of the word,
/// the whitespace, or the penalty. Only the displayed width of each
/// part matters, and this trait is how those widths are reported.
pub trait Fragment: std::fmt::Debug {
    /// Displayed width of the word which this fragment represents.
    fn width(&self) -> f64;

    /// Displayed width of the whitespace which must come after the
    /// word, unless the word is the last one on its line.
    fn whitespace_width(&self) -> f64;

    /// Displayed width of the penalty which must be inserted when
    /// the word is the last one on its line.
    fn penalty_width(&self) -> f64;
}
233
/// A word of text together with the whitespace which trails it.
///
/// `Word` is an implementation of [`Fragment`]. As such, it has a
/// width, some trailing whitespace, and possibly a penalty string.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Word<'a> {
    /// The text of the word itself.
    pub word: &'a str,
    /// Whitespace which follows the word, unless the word is the
    /// last one on its line.
    pub whitespace: &'a str,
    /// Penalty string which follows the word when the word is the
    /// last one on its line.
    pub penalty: &'a str,
    // Cached display width of the word, measured in columns.
    pub(crate) width: usize,
}

/// Dereferencing a `Word` yields its `word` field, which makes the
/// `str` methods available directly on the `Word`.
impl std::ops::Deref for Word<'_> {
    type Target = str;

    fn deref(&self) -> &str {
        self.word
    }
}
257
258impl<'a> Word<'a> {
259    /// Construct a `Word` from a string.
260    ///
261    /// A trailing stretch of `' '` is automatically taken to be the
262    /// whitespace part of the word.
263    pub fn from(word: &str) -> Word<'_> {
264        let trimmed = word.trim_end_matches(' ');
265        Word {
266            word: trimmed,
267            width: display_width(trimmed),
268            whitespace: &word[trimmed.len()..],
269            penalty: "",
270        }
271    }
272
273    /// Break this word into smaller words with a width of at most
274    /// `line_width`. The whitespace and penalty from this `Word` is
275    /// added to the last piece.
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// use textwrap::core::Word;
281    /// assert_eq!(
282    ///     Word::from("Hello!  ").break_apart(3).collect::<Vec<_>>(),
283    ///     vec![Word::from("Hel"), Word::from("lo!  ")]
284    /// );
285    /// ```
286    pub fn break_apart<'b>(&'b self, line_width: usize) -> impl Iterator<Item = Word<'a>> + 'b {
287        let mut char_indices = self.word.char_indices();
288        let mut offset = 0;
289        let mut width = 0;
290
291        std::iter::from_fn(move || {
292            while let Some((idx, ch)) = char_indices.next() {
293                if skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
294                    continue;
295                }
296
297                if width > 0 && width + ch_width(ch) > line_width {
298                    let word = Word {
299                        word: &self.word[offset..idx],
300                        width: width,
301                        whitespace: "",
302                        penalty: "",
303                    };
304                    offset = idx;
305                    width = ch_width(ch);
306                    return Some(word);
307                }
308
309                width += ch_width(ch);
310            }
311
312            if offset < self.word.len() {
313                let word = Word {
314                    word: &self.word[offset..],
315                    width: width,
316                    whitespace: self.whitespace,
317                    penalty: self.penalty,
318                };
319                offset = self.word.len();
320                return Some(word);
321            }
322
323            None
324        })
325    }
326}
327
328impl Fragment for Word<'_> {
329    #[inline]
330    fn width(&self) -> f64 {
331        self.width as f64
332    }
333
334    // We assume the whitespace consist of ' ' only. This allows us to
335    // compute the display width in constant time.
336    #[inline]
337    fn whitespace_width(&self) -> f64 {
338        self.whitespace.len() as f64
339    }
340
341    // We assume the penalty is `""` or `"-"`. This allows us to
342    // compute the display width in constant time.
343    #[inline]
344    fn penalty_width(&self) -> f64 {
345        self.penalty.len() as f64
346    }
347}
348
349/// Forcibly break words wider than `line_width` into smaller words.
350///
351/// This simply calls [`Word::break_apart`] on words that are too
352/// wide. This means that no extra `'-'` is inserted, the word is
353/// simply broken into smaller pieces.
354pub fn break_words<'a, I>(words: I, line_width: usize) -> Vec<Word<'a>>
355where
356    I: IntoIterator<Item = Word<'a>>,
357{
358    let mut shortened_words = Vec::new();
359    for word in words {
360        if word.width > line_width {
361            shortened_words.extend(word.break_apart(line_width));
362        } else {
363            shortened_words.push(word);
364        }
365    }
366    shortened_words
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "unicode-width")]
    use unicode_width::UnicodeWidthChar;

    // A color sequence at the start of the input must be consumed in
    // full, leaving the iterator at the first visible char.
    #[test]
    fn skip_ansi_escape_sequence_works() {
        let mut rest = "\u{1b}[34mHello\u{1b}[0m".chars();
        let first = rest.next().unwrap();
        assert!(skip_ansi_escape_sequence(first, &mut rest));
        assert_eq!(rest.next(), Some('H'));
    }

    #[test]
    fn emojis_have_correct_width() {
        use unic_emoji_char::is_emoji;

        // The Basic Latin (ASCII) and Latin-1 Supplement blocks
        // contain emojis such as '#' and '©'. They are all 1 column
        // wide.
        for emoji in ('\u{1}'..'\u{FF}').filter(|&ch| is_emoji(ch)) {
            let desc = format!("{:?} U+{:04X}", emoji, emoji as u32);

            #[cfg(feature = "unicode-width")]
            assert_eq!(emoji.width().unwrap(), 1, "char: {}", desc);

            #[cfg(not(feature = "unicode-width"))]
            assert_eq!(ch_width(emoji), 1, "char: {}", desc);
        }

        // The rest of the Basic Multilingual Plane (BMP), the
        // Supplementary Multilingual Plane (SMP), and the
        // Supplementary Ideographic Plane (SIP) contain all of our
        // favorite emojis such as 😊. They are 1 or 2 columns wide
        // when unicode-width is used, and always 2 columns wide
        // otherwise.
        for emoji in ('\u{FF}'..'\u{2FFFF}').filter(|&ch| is_emoji(ch)) {
            let desc = format!("{:?} U+{:04X}", emoji, emoji as u32);

            #[cfg(feature = "unicode-width")]
            assert!(emoji.width().unwrap() <= 2, "char: {}", desc);

            #[cfg(not(feature = "unicode-width"))]
            assert_eq!(ch_width(emoji), 2, "char: {}", desc);
        }

        // The planes after the SIP contain almost no assigned code
        // points and thus also no emojis.
    }

    #[test]
    fn display_width_works() {
        assert_eq!("Café Plain".len(), 11); // “é” is two bytes

        // Pairs of input text and the expected width in columns.
        let cases = [
            ("Café Plain", 10),
            ("\u{1b}[31mCafé Rouge\u{1b}[0m", 10),
            (
                "\x1b]8;;http://example.com\x1b\\This is a link\x1b]8;;\x1b\\",
                14,
            ),
        ];
        for &(text, expected) in cases.iter() {
            assert_eq!(display_width(text), expected);
        }
    }

    #[test]
    fn display_width_narrow_emojis() {
        #[cfg(feature = "unicode-width")]
        let expected = 1;

        // The ⁉ character is above DOUBLE_WIDTH_CUTOFF.
        #[cfg(not(feature = "unicode-width"))]
        let expected = 2;

        assert_eq!(display_width("⁉"), expected);
    }

    #[test]
    fn display_width_narrow_emojis_variant_selector() {
        #[cfg(feature = "unicode-width")]
        let expected = 1;

        // The variant selector-16 is also counted.
        #[cfg(not(feature = "unicode-width"))]
        let expected = 4;

        assert_eq!(display_width("⁉\u{fe0f}"), expected);
    }

    #[test]
    fn display_width_emojis() {
        let ten_emojis = "😂😭🥺🤣✨😍🙏🥰😊🔥";
        assert_eq!(display_width(ten_emojis), 20);
    }
}
textwrap/core.rs

textwrap/
core.rs