textwrap/word_separators.rs
1//! Functionality for finding words.
2//!
3//! In order to wrap text, we need to know where the legal break
4//! points are, i.e., where the words of the text are. This means that
5//! we need to define what a "word" is.
6//!
//! A simple approach is to simply split the text on whitespace, but
//! this does not work for East-Asian languages such as Chinese or
//! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
12//!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and
//! its variants for more information.
16
17#[cfg(feature = "unicode-linebreak")]
18use crate::core::skip_ansi_escape_sequence;
19use crate::core::Word;
20
21/// Describes where words occur in a line of text.
22///
23/// The simplest approach is say that words are separated by one or
24/// more ASCII spaces (`' '`). This works for Western languages
25/// without emojis. A more complex approach is to use the Unicode line
26/// breaking algorithm, which finds break points in non-ASCII text.
27///
28/// The line breaks occur between words, please see
29/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
30/// hyphenation of individual words.
31///
32/// # Examples
33///
34/// ```
35/// use textwrap::core::Word;
36/// use textwrap::WordSeparator::AsciiSpace;
37///
38/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
39/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
40/// ```
41#[derive(Debug, Clone, Copy)]
42pub enum WordSeparator {
43 /// Find words by splitting on runs of `' '` characters.
44 ///
45 /// # Examples
46 ///
47 /// ```
48 /// use textwrap::core::Word;
49 /// use textwrap::WordSeparator::AsciiSpace;
50 ///
51 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
52 /// assert_eq!(words, vec![Word::from("Hello "),
53 /// Word::from("World!")]);
54 /// ```
55 AsciiSpace,
56
57 /// Split `line` into words using Unicode break properties.
58 ///
59 /// This word separator uses the Unicode line breaking algorithm
60 /// described in [Unicode Standard Annex
61 /// #14](https://www.unicode.org/reports/tr14/) to find legal places
62 /// to break lines. There is a small difference in that the U+002D
63 /// (Hyphen-Minus) and U+00AD (Soft Hyphen) donβt create a line break:
64 /// to allow a line break at a hyphen, use
65 /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
66 /// Soft hyphens are not currently supported.
67 ///
68 /// # Examples
69 ///
70 /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
71 /// breaking algorithm will find line break opportunities between
72 /// some characters with no intervening whitespace:
73 ///
74 /// ```
75 /// #[cfg(feature = "unicode-linebreak")] {
76 /// use textwrap::core::Word;
77 /// use textwrap::WordSeparator::UnicodeBreakProperties;
78 ///
79 /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: ππ").collect::<Vec<_>>(),
80 /// vec![Word::from("Emojis: "),
81 /// Word::from("π"),
82 /// Word::from("π")]);
83 ///
84 /// assert_eq!(UnicodeBreakProperties.find_words("CJK: δ½ ε₯½").collect::<Vec<_>>(),
85 /// vec![Word::from("CJK: "),
86 /// Word::from("δ½ "),
87 /// Word::from("ε₯½")]);
88 /// }
89 /// ```
90 ///
91 /// A U+2060 (Word Joiner) character can be inserted if you want to
92 /// manually override the defaults and keep the characters together:
93 ///
94 /// ```
95 /// #[cfg(feature = "unicode-linebreak")] {
96 /// use textwrap::core::Word;
97 /// use textwrap::WordSeparator::UnicodeBreakProperties;
98 ///
99 /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: π\u{2060}π").collect::<Vec<_>>(),
100 /// vec![Word::from("Emojis: "),
101 /// Word::from("π\u{2060}π")]);
102 /// }
103 /// ```
104 ///
105 /// The Unicode line breaking algorithm will also automatically
106 /// suppress break breaks around certain punctuation characters::
107 ///
108 /// ```
109 /// #[cfg(feature = "unicode-linebreak")] {
110 /// use textwrap::core::Word;
111 /// use textwrap::WordSeparator::UnicodeBreakProperties;
112 ///
113 /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
114 /// vec![Word::from("[ foo ] "),
115 /// Word::from("bar !")]);
116 /// }
117 /// ```
118 #[cfg(feature = "unicode-linebreak")]
119 UnicodeBreakProperties,
120
121 /// Find words using a custom word separator
122 Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
123}
124
125impl PartialEq for WordSeparator {
126 /// Compare two word separators.
127 ///
128 /// ```
129 /// use textwrap::WordSeparator;
130 ///
131 /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
132 /// #[cfg(feature = "unicode-linebreak")] {
133 /// assert_eq!(WordSeparator::UnicodeBreakProperties,
134 /// WordSeparator::UnicodeBreakProperties);
135 /// }
136 /// ```
137 ///
138 /// Note that `WordSeparator::Custom` values never compare equal:
139 ///
140 /// ```
141 /// use textwrap::WordSeparator;
142 /// use textwrap::core::Word;
143 /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
144 /// Box::new(line.split_inclusive(' ').map(Word::from))
145 /// }
146 /// assert_ne!(WordSeparator::Custom(word_separator),
147 /// WordSeparator::Custom(word_separator));
148 /// ```
149 fn eq(&self, other: &Self) -> bool {
150 match (self, other) {
151 (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
152 #[cfg(feature = "unicode-linebreak")]
153 (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
154 (_, _) => false,
155 }
156 }
157}
158
159impl WordSeparator {
160 /// Create a new word separator.
161 ///
162 /// The best available algorithm is used by default, i.e.,
163 /// [`WordSeparator::UnicodeBreakProperties`] if available,
164 /// otherwise [`WordSeparator::AsciiSpace`].
165 pub const fn new() -> Self {
166 #[cfg(feature = "unicode-linebreak")]
167 {
168 WordSeparator::UnicodeBreakProperties
169 }
170
171 #[cfg(not(feature = "unicode-linebreak"))]
172 {
173 WordSeparator::AsciiSpace
174 }
175 }
176
177 // This function should really return impl Iterator<Item = Word>, but
178 // this isn't possible until Rust supports higher-kinded types:
179 // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
180 /// Find all words in `line`.
181 pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
182 match self {
183 WordSeparator::AsciiSpace => find_words_ascii_space(line),
184 #[cfg(feature = "unicode-linebreak")]
185 WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
186 WordSeparator::Custom(func) => func(line),
187 }
188 }
189}
190
191fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
192 let mut start = 0;
193 let mut in_whitespace = false;
194 let mut char_indices = line.char_indices();
195
196 Box::new(std::iter::from_fn(move || {
197 for (idx, ch) in char_indices.by_ref() {
198 if in_whitespace && ch != ' ' {
199 let word = Word::from(&line[start..idx]);
200 start = idx;
201 in_whitespace = ch == ' ';
202 return Some(word);
203 }
204
205 in_whitespace = ch == ' ';
206 }
207
208 if start < line.len() {
209 let word = Word::from(&line[start..]);
210 start = line.len();
211 return Some(word);
212 }
213
214 None
215 }))
216}
217
/// Return a copy of `text` with all ANSI escape sequences removed.
///
/// Detection of the escape sequences is delegated to
/// `skip_ansi_escape_sequence`, which also consumes the rest of a
/// sequence from the character iterator when it finds one.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    // The stripped text can never be longer than the input.
    let mut stripped = String::with_capacity(text.len());
    let mut remaining = text.chars();

    while let Some(ch) = remaining.next() {
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            stripped.push(ch);
        }
    }

    stripped
}
233
/// Soft hyphen, also known as a "shy hyphen". Should show up as '-'
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
240
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences in `line` are ignored when looking for break
/// opportunities, but they are kept in the returned words. Breaks
/// directly after `-` and after a soft hyphen are suppressed.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // The break opportunities are computed on a copy of `line`
    // without escape sequences, but the words must be slices of the
    // original string. This iterator yields (original index, stripped
    // index) pairs so that we can translate between the two. An
    // escape sequence yields a single pair and does not advance the
    // stripped index.
    let mut next_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = next_stripped_idx;
        let mut rest = char_indices.by_ref().map(|(_, ch)| ch);
        if !skip_ansi_escape_sequence(ch, &mut rest) {
            next_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|&(idx, _)| {
            let preceding = stripped[..idx].chars().next_back();
            // Breaks after '-' are suppressed since we want to
            // control them via the WordSplitter. Soft hyphens are
            // currently not supported since we require all `Word`
            // fragments to be continuous in the input string. All
            // other breaks should be fine!
            !matches!(preceding, Some('-') | Some(SHY))
        })
        // Collecting ends the borrow of `stripped`, which does not
        // live as long as the iterator we return.
        .collect::<Vec<_>>()
        .into_iter();

    // Drop the final break opportunity: the last word is formed with
    // &line[start..] below, which ensures that a trailing ANSI escape
    // sequence is included.
    opportunities.next_back();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        // Translate the next break opportunity to an index in `line`.
        // Opportunities without a matching index are skipped.
        let boundary = opportunities.find_map(|(break_idx, _)| {
            idx_map.find(|&(_, stripped_idx)| stripped_idx == break_idx)
        });

        let end = match boundary {
            Some((orig_idx, _)) => orig_idx,
            // No more breaks: emit whatever is left as the last word.
            None if start < line.len() => line.len(),
            None => return None,
        };

        let word = Word::from(&line[start..end]);
        start = end;
        Some(word)
    }))
}
306
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Turn string slices into the `Word`s we expect to find.
    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate a pair of tests, one per word separator. Each case is
    // a `[line, expected ASCII words, expected Unicode words]` triple.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // A single space and a run of several spaces both form one word.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    // A run of several spaces stays attached to the preceding word.
    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo ", ["foo "], ["foo "]],
        ["foo   ", ["foo   "], ["foo   "]]
    );

    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        [" foo", [" ", "foo"], [" ", "foo"]],
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    #[test]
    fn find_words_color_inside_word() {
        // Escape sequences in the middle of a word must not split it.
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }

    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));

        #[cfg(not(feature = "unicode-linebreak"))]
        assert!(matches!(WordSeparator::new(), AsciiSpace));
    }
}