textwrap/
word_separators.rs

1//! Functionality for finding words.
2//!
3//! In order to wrap text, we need to know where the legal break
4//! points are, i.e., where the words of the text are. This means that
5//! we need to define what a "word" is.
6//!
//! A simple approach is to simply split the text on whitespace, but
//! this does not work for East-Asian languages such as Chinese or
//! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
12//!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and
//! its variants for more information.
16
17#[cfg(feature = "unicode-linebreak")]
18use crate::core::skip_ansi_escape_sequence;
19use crate::core::Word;
20
21/// Describes where words occur in a line of text.
22///
23/// The simplest approach is say that words are separated by one or
24/// more ASCII spaces (`' '`). This works for Western languages
25/// without emojis. A more complex approach is to use the Unicode line
26/// breaking algorithm, which finds break points in non-ASCII text.
27///
28/// The line breaks occur between words, please see
29/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
30/// hyphenation of individual words.
31///
32/// # Examples
33///
34/// ```
35/// use textwrap::core::Word;
36/// use textwrap::WordSeparator::AsciiSpace;
37///
38/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
39/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
40/// ```
41#[derive(Debug, Clone, Copy)]
42pub enum WordSeparator {
43    /// Find words by splitting on runs of `' '` characters.
44    ///
45    /// # Examples
46    ///
47    /// ```
48    /// use textwrap::core::Word;
49    /// use textwrap::WordSeparator::AsciiSpace;
50    ///
51    /// let words = AsciiSpace.find_words("Hello   World!").collect::<Vec<_>>();
52    /// assert_eq!(words, vec![Word::from("Hello   "),
53    ///                        Word::from("World!")]);
54    /// ```
55    AsciiSpace,
56
57    /// Split `line` into words using Unicode break properties.
58    ///
59    /// This word separator uses the Unicode line breaking algorithm
60    /// described in [Unicode Standard Annex
61    /// #14](https://www.unicode.org/reports/tr14/) to find legal places
62    /// to break lines. There is a small difference in that the U+002D
63    /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
64    /// to allow a line break at a hyphen, use
65    /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
66    /// Soft hyphens are not currently supported.
67    ///
68    /// # Examples
69    ///
70    /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
71    /// breaking algorithm will find line break opportunities between
72    /// some characters with no intervening whitespace:
73    ///
74    /// ```
75    /// #[cfg(feature = "unicode-linebreak")] {
76    /// use textwrap::core::Word;
77    /// use textwrap::WordSeparator::UnicodeBreakProperties;
78    ///
79    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚πŸ˜").collect::<Vec<_>>(),
80    ///            vec![Word::from("Emojis: "),
81    ///                 Word::from("πŸ˜‚"),
82    ///                 Word::from("😍")]);
83    ///
84    /// assert_eq!(UnicodeBreakProperties.find_words("CJK: δ½ ε₯½").collect::<Vec<_>>(),
85    ///            vec![Word::from("CJK: "),
86    ///                 Word::from("δ½ "),
87    ///                 Word::from("ε₯½")]);
88    /// }
89    /// ```
90    ///
91    /// A U+2060 (Word Joiner) character can be inserted if you want to
92    /// manually override the defaults and keep the characters together:
93    ///
94    /// ```
95    /// #[cfg(feature = "unicode-linebreak")] {
96    /// use textwrap::core::Word;
97    /// use textwrap::WordSeparator::UnicodeBreakProperties;
98    ///
99    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚\u{2060}😍").collect::<Vec<_>>(),
100    ///            vec![Word::from("Emojis: "),
101    ///                 Word::from("πŸ˜‚\u{2060}😍")]);
102    /// }
103    /// ```
104    ///
105    /// The Unicode line breaking algorithm will also automatically
106    /// suppress break breaks around certain punctuation characters::
107    ///
108    /// ```
109    /// #[cfg(feature = "unicode-linebreak")] {
110    /// use textwrap::core::Word;
111    /// use textwrap::WordSeparator::UnicodeBreakProperties;
112    ///
113    /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
114    ///            vec![Word::from("[ foo ] "),
115    ///                 Word::from("bar !")]);
116    /// }
117    /// ```
118    #[cfg(feature = "unicode-linebreak")]
119    UnicodeBreakProperties,
120
121    /// Find words using a custom word separator
122    Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
123}
124
125impl PartialEq for WordSeparator {
126    /// Compare two word separators.
127    ///
128    /// ```
129    /// use textwrap::WordSeparator;
130    ///
131    /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
132    /// #[cfg(feature = "unicode-linebreak")] {
133    ///     assert_eq!(WordSeparator::UnicodeBreakProperties,
134    ///                WordSeparator::UnicodeBreakProperties);
135    /// }
136    /// ```
137    ///
138    /// Note that `WordSeparator::Custom` values never compare equal:
139    ///
140    /// ```
141    /// use textwrap::WordSeparator;
142    /// use textwrap::core::Word;
143    /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
144    ///     Box::new(line.split_inclusive(' ').map(Word::from))
145    /// }
146    /// assert_ne!(WordSeparator::Custom(word_separator),
147    ///            WordSeparator::Custom(word_separator));
148    /// ```
149    fn eq(&self, other: &Self) -> bool {
150        match (self, other) {
151            (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
152            #[cfg(feature = "unicode-linebreak")]
153            (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
154            (_, _) => false,
155        }
156    }
157}
158
159impl WordSeparator {
160    /// Create a new word separator.
161    ///
162    /// The best available algorithm is used by default, i.e.,
163    /// [`WordSeparator::UnicodeBreakProperties`] if available,
164    /// otherwise [`WordSeparator::AsciiSpace`].
165    pub const fn new() -> Self {
166        #[cfg(feature = "unicode-linebreak")]
167        {
168            WordSeparator::UnicodeBreakProperties
169        }
170
171        #[cfg(not(feature = "unicode-linebreak"))]
172        {
173            WordSeparator::AsciiSpace
174        }
175    }
176
177    // This function should really return impl Iterator<Item = Word>, but
178    // this isn't possible until Rust supports higher-kinded types:
179    // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
180    /// Find all words in `line`.
181    pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
182        match self {
183            WordSeparator::AsciiSpace => find_words_ascii_space(line),
184            #[cfg(feature = "unicode-linebreak")]
185            WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
186            WordSeparator::Custom(func) => func(line),
187        }
188    }
189}
190
191fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
192    let mut start = 0;
193    let mut in_whitespace = false;
194    let mut char_indices = line.char_indices();
195
196    Box::new(std::iter::from_fn(move || {
197        for (idx, ch) in char_indices.by_ref() {
198            if in_whitespace && ch != ' ' {
199                let word = Word::from(&line[start..idx]);
200                start = idx;
201                in_whitespace = ch == ' ';
202                return Some(word);
203            }
204
205            in_whitespace = ch == ' ';
206        }
207
208        if start < line.len() {
209            let word = Word::from(&line[start..]);
210            start = line.len();
211            return Some(word);
212        }
213
214        None
215    }))
216}
217
/// Return a copy of `text` without its ANSI escape sequences.
///
/// Each character is offered to `skip_ansi_escape_sequence` together
/// with the iterator over the characters that follow it. Characters
/// for which it returns `true` are left out of the result.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut chars = text.chars();
    let mut stripped = String::with_capacity(text.len());

    stripped.extend(std::iter::from_fn(|| loop {
        let ch = chars.next()?;
        if !skip_ansi_escape_sequence(ch, &mut chars) {
            return Some(ch);
        }
    }));

    stripped
}
233
/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
238#[cfg(feature = "unicode-linebreak")]
239const SHY: char = '\u{00ad}';
240
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences are ignored in `line`: the break
/// opportunities are computed on a copy of `line` with the escape
/// sequences stripped, and are then translated back to byte offsets
/// in `line`, so the returned words are slices of the original string.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    let stripped = strip_ansi_escape_sequences(line);

    // Byte offsets into `stripped` where a line break is allowed.
    let mut breaks: Vec<usize> = unicode_linebreak::linebreaks(&stripped)
        .map(|(break_idx, _)| break_idx)
        .filter(|&break_idx| {
            // We suppress breaks after ‘-’ since we want to control
            // this via the WordSplitter. Soft hyphens are currently
            // not supported since we require all `Word` fragments to
            // be continuous in the input string. Other breaks should
            // be fine!
            let preceding = stripped[..break_idx].chars().next_back();
            !matches!(preceding, Some('-') | Some(SHY))
        })
        .collect();

    // Drop the final break opportunity. The last word is produced
    // below from `&line[start..]` instead, which ensures that a
    // trailing ANSI escape sequence is included in it.
    breaks.pop();
    let mut breaks = breaks.into_iter();

    // Lazily yields (original index, stripped index) pairs: for each
    // character that is visited in `line`, its byte offset in `line`
    // and the byte offset it corresponds to in `stripped`.
    let mut stripped_len = 0;
    let mut chars = line.char_indices();
    let mut index_pairs = std::iter::from_fn(move || {
        let (orig_idx, ch) = chars.next()?;
        let stripped_idx = stripped_len;
        let skipped = skip_ansi_escape_sequence(ch, &mut chars.by_ref().map(|(_, ch)| ch));
        if !skipped {
            stripped_len += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        // Advance to the next break opportunity that can be located
        // in the original string.
        let next_break = breaks.by_ref().find_map(|break_idx| {
            index_pairs.find(|&(_, stripped_idx)| stripped_idx == break_idx)
        });

        if let Some((orig_idx, _)) = next_break {
            let word = &line[start..orig_idx];
            start = orig_idx;
            return Some(Word::from(word));
        }

        // No break opportunities left: whatever remains is the last word.
        if start >= line.len() {
            return None;
        }
        let rest = &line[start..];
        start = line.len();
        Some(Word::from(rest))
    }))
}
306
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    /// Assert that `separator` splits `line` into exactly the words
    /// in `expected`.
    #[track_caller]
    fn assert_words<'a>(separator: WordSeparator, line: &'a str, expected: &[&'a str]) {
        let expected_words: Vec<Word<'_>> = expected.iter().copied().map(Word::from).collect();
        let actual_words: Vec<Word<'_>> = separator.find_words(line).collect();
        assert_eq!(actual_words, expected_words, "Line: {:?}", line);
    }

    /// Generate a pair of tests from `[line, ascii words, unicode words]`
    /// triples: one test checks every line with `AsciiSpace`, the other
    /// checks every line with `UnicodeBreakProperties` (only when the
    /// `unicode-linebreak` feature is enabled).
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    assert_words(AsciiSpace, $line, &$ascii_words);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    assert_words(UnicodeBreakProperties, $line, &$unicode_words);
                )+
            }
        };
    }

    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo   ", ["foo   "], ["foo   "]]
    );

    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    // Color escape sequences around whole words must stay attached to
    // the word they surround.
    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        let text = format!("{}{}", green_hello, blue_world);
        let expected = vec![
            Word::from(green_hello.as_str()),
            Word::from(blue_world.as_str()),
        ];

        assert_eq!(AsciiSpace.find_words(&text).collect::<Vec<_>>(), expected);

        #[cfg(feature = "unicode-linebreak")]
        assert_eq!(
            UnicodeBreakProperties.find_words(&text).collect::<Vec<_>>(),
            expected
        );
    }

    // Escape sequences in the middle of a word must not split it.
    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        let expected = vec![Word::from(text)];

        assert_eq!(AsciiSpace.find_words(text).collect::<Vec<_>>(), expected);

        #[cfg(feature = "unicode-linebreak")]
        assert_eq!(
            UnicodeBreakProperties.find_words(text).collect::<Vec<_>>(),
            expected
        );
    }

    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert_eq!(WordSeparator::new(), UnicodeBreakProperties);

        #[cfg(not(feature = "unicode-linebreak"))]
        assert_eq!(WordSeparator::new(), AsciiSpace);
    }
}