bumpalo/collections/str/
lossy.rs

1// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use crate::collections::str as core_str;
12use core::char;
13use core::fmt;
14use core::fmt::Write;
15use core::str;
16
/// Lossy UTF-8 string.
///
/// A borrowed view over a byte slice that is not required to be valid UTF-8.
/// The bytes are never copied or modified; invalid sequences are only dealt
/// with when the value is iterated via `chunks` or formatted (`Display`
/// substitutes U+FFFD for each broken sequence, `Debug` prints the broken
/// bytes as `\xNN` escapes).
pub struct Utf8Lossy<'a> {
    /// The wrapped bytes; may contain invalid UTF-8.
    bytes: &'a [u8],
}
21
22impl<'a> Utf8Lossy<'a> {
23    pub fn from_bytes(bytes: &'a [u8]) -> Utf8Lossy<'a> {
24        Utf8Lossy { bytes }
25    }
26
27    pub fn chunks(&self) -> Utf8LossyChunksIter<'a> {
28        Utf8LossyChunksIter {
29            source: &self.bytes,
30        }
31    }
32}
33
/// Iterator over the chunks of a lossy UTF-8 string.
///
/// Each call to `next` yields a `Utf8LossyChunk` and advances past the bytes
/// that chunk covers.
// No `Debug` impl is provided for this type in this file, hence the lint is
// silenced explicitly.
#[allow(missing_debug_implementations)]
pub struct Utf8LossyChunksIter<'a> {
    /// Bytes that have not been yielded yet; empty once iteration is done.
    source: &'a [u8],
}
39
/// One step of lossy decoding: a run of valid UTF-8 followed by the invalid
/// byte sequence (if any) that terminated it.
#[derive(PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
    /// Sequence of valid chars.
    /// Can be empty between broken UTF-8 chars.
    pub valid: &'a str,
    /// Bytes of a single broken char, empty if none.
    /// An empty `broken` means this is the last item of the iterator. The
    /// converse does not hold: when the input ends in an invalid sequence,
    /// the last item carries that sequence here.
    pub broken: &'a [u8],
}
49
50impl<'a> Iterator for Utf8LossyChunksIter<'a> {
51    type Item = Utf8LossyChunk<'a>;
52
53    fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
54        if self.source.is_empty() {
55            return None;
56        }
57
58        const TAG_CONT_U8: u8 = 128;
59        fn unsafe_get(xs: &[u8], i: usize) -> u8 {
60            unsafe { *xs.get_unchecked(i) }
61        }
62        fn safe_get(xs: &[u8], i: usize) -> u8 {
63            if i >= xs.len() {
64                0
65            } else {
66                unsafe_get(xs, i)
67            }
68        }
69
70        let mut i = 0;
71        while i < self.source.len() {
72            let i_ = i;
73
74            let byte = unsafe_get(self.source, i);
75            i += 1;
76
77            if byte < 128 {
78            } else {
79                let w = core_str::utf8_char_width(byte);
80
81                macro_rules! error {
82                    () => {{
83                        unsafe {
84                            let r = Utf8LossyChunk {
85                                valid: str::from_utf8_unchecked(&self.source[0..i_]),
86                                broken: &self.source[i_..i],
87                            };
88                            self.source = &self.source[i..];
89                            return Some(r);
90                        }
91                    }};
92                }
93
94                match w {
95                    2 => {
96                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
97                            error!();
98                        }
99                        i += 1;
100                    }
101                    3 => {
102                        match (byte, safe_get(self.source, i)) {
103                            (0xE0, 0xA0..=0xBF) => (),
104                            (0xE1..=0xEC, 0x80..=0xBF) => (),
105                            (0xED, 0x80..=0x9F) => (),
106                            (0xEE..=0xEF, 0x80..=0xBF) => (),
107                            _ => {
108                                error!();
109                            }
110                        }
111                        i += 1;
112                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
113                            error!();
114                        }
115                        i += 1;
116                    }
117                    4 => {
118                        match (byte, safe_get(self.source, i)) {
119                            (0xF0, 0x90..=0xBF) => (),
120                            (0xF1..=0xF3, 0x80..=0xBF) => (),
121                            (0xF4, 0x80..=0x8F) => (),
122                            _ => {
123                                error!();
124                            }
125                        }
126                        i += 1;
127                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
128                            error!();
129                        }
130                        i += 1;
131                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
132                            error!();
133                        }
134                        i += 1;
135                    }
136                    _ => {
137                        error!();
138                    }
139                }
140            }
141        }
142
143        let r = Utf8LossyChunk {
144            valid: unsafe { str::from_utf8_unchecked(self.source) },
145            broken: &[],
146        };
147        self.source = &[];
148        Some(r)
149    }
150}
151
152impl<'a> fmt::Display for Utf8Lossy<'a> {
153    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
154        // If we're the empty string then our iterator won't actually yield
155        // anything, so perform the formatting manually
156        if self.bytes.is_empty() {
157            return "".fmt(f);
158        }
159
160        for Utf8LossyChunk { valid, broken } in self.chunks() {
161            // If we successfully decoded the whole chunk as a valid string then
162            // we can return a direct formatting of the string which will also
163            // respect various formatting flags if possible.
164            if valid.len() == self.bytes.len() {
165                assert!(broken.is_empty());
166                return valid.fmt(f);
167            }
168
169            f.write_str(valid)?;
170            if !broken.is_empty() {
171                f.write_char(char::REPLACEMENT_CHARACTER)?;
172            }
173        }
174        Ok(())
175    }
176}
177
178impl<'a> fmt::Debug for Utf8Lossy<'a> {
179    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
180        f.write_char('"')?;
181
182        for Utf8LossyChunk { valid, broken } in self.chunks() {
183            // Valid part.
184            // Here we partially parse UTF-8 again which is suboptimal.
185            {
186                let mut from = 0;
187                for (i, c) in valid.char_indices() {
188                    let esc = c.escape_debug();
189                    // If char needs escaping, flush backlog so far and write, else skip
190                    if esc.len() != 1 {
191                        f.write_str(&valid[from..i])?;
192                        for c in esc {
193                            f.write_char(c)?;
194                        }
195                        from = i + c.len_utf8();
196                    }
197                }
198                f.write_str(&valid[from..])?;
199            }
200
201            // Broken parts of string as hex escape.
202            for &b in broken {
203                write!(f, "\\x{:02x}", b)?;
204            }
205        }
206
207        f.write_char('"')
208    }
209}