bumpalo/collections/str/
lossy.rs1use crate::collections::str as core_str;
12use core::char;
13use core::fmt;
14use core::fmt::Write;
15use core::str;
16
/// A borrowed byte string that is displayed as text even when it is not
/// valid UTF-8.
///
/// The wrapper only stores the slice handed to `from_bytes`; nothing is
/// copied or validated at construction time.
pub struct Utf8Lossy<'a> {
    /// The wrapped bytes, exactly as supplied by the caller.
    bytes: &'a [u8],
}
21
22impl<'a> Utf8Lossy<'a> {
23 pub fn from_bytes(bytes: &'a [u8]) -> Utf8Lossy<'a> {
24 Utf8Lossy { bytes }
25 }
26
27 pub fn chunks(&self) -> Utf8LossyChunksIter<'a> {
28 Utf8LossyChunksIter {
29 source: &self.bytes,
30 }
31 }
32}
33
/// Iterator that walks a byte slice and yields it as `Utf8LossyChunk`s.
///
/// Obtained from `Utf8Lossy::chunks`.
// NOTE(review): the lint is silenced because this type has no `Debug` impl in
// this file; presumably that is intentional — confirm before removing.
#[allow(missing_debug_implementations)]
pub struct Utf8LossyChunksIter<'a> {
    /// The bytes that have not been handed out in a chunk yet.
    source: &'a [u8],
}
39
/// One item produced by `Utf8LossyChunksIter`: a run of valid text followed
/// by the invalid bytes (if any) that terminated the run.
#[derive(Debug, PartialEq, Eq)]
pub struct Utf8LossyChunk<'a> {
    /// Valid UTF-8 text preceding `broken`. May be empty, e.g. when two
    /// invalid sequences are adjacent.
    pub valid: &'a str,
    /// The bytes of a single invalid sequence. Empty only for a chunk that
    /// consumed the rest of the input without hitting invalid data.
    pub broken: &'a [u8],
}
49
50impl<'a> Iterator for Utf8LossyChunksIter<'a> {
51 type Item = Utf8LossyChunk<'a>;
52
53 fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
54 if self.source.is_empty() {
55 return None;
56 }
57
58 const TAG_CONT_U8: u8 = 128;
59 fn unsafe_get(xs: &[u8], i: usize) -> u8 {
60 unsafe { *xs.get_unchecked(i) }
61 }
62 fn safe_get(xs: &[u8], i: usize) -> u8 {
63 if i >= xs.len() {
64 0
65 } else {
66 unsafe_get(xs, i)
67 }
68 }
69
70 let mut i = 0;
71 while i < self.source.len() {
72 let i_ = i;
73
74 let byte = unsafe_get(self.source, i);
75 i += 1;
76
77 if byte < 128 {
78 } else {
79 let w = core_str::utf8_char_width(byte);
80
81 macro_rules! error {
82 () => {{
83 unsafe {
84 let r = Utf8LossyChunk {
85 valid: str::from_utf8_unchecked(&self.source[0..i_]),
86 broken: &self.source[i_..i],
87 };
88 self.source = &self.source[i..];
89 return Some(r);
90 }
91 }};
92 }
93
94 match w {
95 2 => {
96 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
97 error!();
98 }
99 i += 1;
100 }
101 3 => {
102 match (byte, safe_get(self.source, i)) {
103 (0xE0, 0xA0..=0xBF) => (),
104 (0xE1..=0xEC, 0x80..=0xBF) => (),
105 (0xED, 0x80..=0x9F) => (),
106 (0xEE..=0xEF, 0x80..=0xBF) => (),
107 _ => {
108 error!();
109 }
110 }
111 i += 1;
112 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
113 error!();
114 }
115 i += 1;
116 }
117 4 => {
118 match (byte, safe_get(self.source, i)) {
119 (0xF0, 0x90..=0xBF) => (),
120 (0xF1..=0xF3, 0x80..=0xBF) => (),
121 (0xF4, 0x80..=0x8F) => (),
122 _ => {
123 error!();
124 }
125 }
126 i += 1;
127 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
128 error!();
129 }
130 i += 1;
131 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
132 error!();
133 }
134 i += 1;
135 }
136 _ => {
137 error!();
138 }
139 }
140 }
141 }
142
143 let r = Utf8LossyChunk {
144 valid: unsafe { str::from_utf8_unchecked(self.source) },
145 broken: &[],
146 };
147 self.source = &[];
148 Some(r)
149 }
150}
151
152impl<'a> fmt::Display for Utf8Lossy<'a> {
153 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
154 if self.bytes.is_empty() {
157 return "".fmt(f);
158 }
159
160 for Utf8LossyChunk { valid, broken } in self.chunks() {
161 if valid.len() == self.bytes.len() {
165 assert!(broken.is_empty());
166 return valid.fmt(f);
167 }
168
169 f.write_str(valid)?;
170 if !broken.is_empty() {
171 f.write_char(char::REPLACEMENT_CHARACTER)?;
172 }
173 }
174 Ok(())
175 }
176}
177
178impl<'a> fmt::Debug for Utf8Lossy<'a> {
179 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
180 f.write_char('"')?;
181
182 for Utf8LossyChunk { valid, broken } in self.chunks() {
183 {
186 let mut from = 0;
187 for (i, c) in valid.char_indices() {
188 let esc = c.escape_debug();
189 if esc.len() != 1 {
191 f.write_str(&valid[from..i])?;
192 for c in esc {
193 f.write_char(c)?;
194 }
195 from = i + c.len_utf8();
196 }
197 }
198 f.write_str(&valid[from..])?;
199 }
200
201 for &b in broken {
203 write!(f, "\\x{:02x}", b)?;
204 }
205 }
206
207 f.write_char('"')
208 }
209}