diff --git a/src/uu/fmt/src/fmt.rs b/src/uu/fmt/src/fmt.rs index 882c0834a68..c06c5702b41 100644 --- a/src/uu/fmt/src/fmt.rs +++ b/src/uu/fmt/src/fmt.rs @@ -234,7 +234,7 @@ fn process_file( match para_result { Err(s) => { ostream - .write_all(s.as_bytes()) + .write_all(&s) .map_err_context(|| translate!("fmt-error-failed-to-write-output"))?; ostream .write_all(b"\n") diff --git a/src/uu/fmt/src/linebreak.rs b/src/uu/fmt/src/linebreak.rs index 653e7c3e049..a64728aeb00 100644 --- a/src/uu/fmt/src/linebreak.rs +++ b/src/uu/fmt/src/linebreak.rs @@ -14,7 +14,7 @@ use crate::parasplit::{ParaWords, Paragraph, WordInfo}; struct BreakArgs<'a> { opts: &'a FmtOptions, init_len: usize, - indent_str: &'a str, + indent: &'a [u8], indent_len: usize, uniform: bool, ostream: &'a mut BufWriter, @@ -59,11 +59,11 @@ pub fn break_lines( let p_init_len = winfo.word_nchars + if opts.crown || opts.tagged { // handle "init" portion - ostream.write_all(para.init_str.as_bytes())?; + ostream.write_all(¶.init_str)?; para.init_len } else if !para.mail_header { // for non-(crown, tagged) that's the same as a normal indent - ostream.write_all(p_indent.as_bytes())?; + ostream.write_all(p_indent)?; p_indent_len } else { // except that mail headers get no indent at all @@ -71,7 +71,7 @@ pub fn break_lines( }; // write first word after writing init - ostream.write_all(winfo.word.as_bytes())?; + ostream.write_all(winfo.word)?; // does this paragraph require uniform spacing? let uniform = para.mail_header || opts.uniform; @@ -79,7 +79,7 @@ pub fn break_lines( let mut break_args = BreakArgs { opts, init_len: p_init_len, - indent_str: p_indent, + indent: p_indent, indent_len: p_indent_len, uniform, ostream, @@ -121,7 +121,7 @@ fn accum_words_simple<'a>( ); if l + wlen + slen > args.opts.width { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; write_with_spaces(&winfo.word[winfo.word_start..], 0, args.ostream)?; Ok((args.indent_len + winfo.word_nchars, winfo.ends_punct)) } else { @@ -146,7 +146,7 @@ fn break_knuth_plass<'a, T: Clone + Iterator>>( (false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| { if fresh { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; } // at each breakpoint, keep emitting words until we find the word matching this breakpoint for winfo in &mut iter { @@ -167,7 +167,7 @@ fn break_knuth_plass<'a, T: Clone + Iterator>>( if std::ptr::eq(winfo, next_break) { // OK, we found the matching word if break_before { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; write_with_spaces(&winfo.word[winfo.word_start..], 0, args.ostream)?; } else { // breaking after this word, so that means "fresh" is true for the next iteration @@ -186,7 +186,7 @@ fn break_knuth_plass<'a, T: Clone + Iterator>>( // after the last linebreak, write out the rest of the final line. for winfo in iter { if fresh { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; } let (slen, word) = slice_if_fresh( fresh, @@ -474,13 +474,13 @@ fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> usize /// Otherwise, compute `slen` and leave whitespace alone. fn slice_if_fresh( fresh: bool, - word: &str, + word: &[u8], start: usize, uniform: bool, newline: bool, sstart: bool, punct: bool, -) -> (usize, &str) { +) -> (usize, &[u8]) { if fresh { (0, &word[start..]) } else { @@ -489,14 +489,14 @@ fn slice_if_fresh( } /// Write a newline and add the indent. -fn write_newline(indent: &str, ostream: &mut BufWriter) -> std::io::Result<()> { +fn write_newline(indent: &[u8], ostream: &mut BufWriter) -> std::io::Result<()> { ostream.write_all(b"\n")?; - ostream.write_all(indent.as_bytes()) + ostream.write_all(indent) } /// Write the word, along with slen spaces. fn write_with_spaces( - word: &str, + word: &[u8], slen: usize, ostream: &mut BufWriter, ) -> std::io::Result<()> { @@ -505,5 +505,5 @@ fn write_with_spaces( } else if slen == 1 { ostream.write_all(b" ")?; } - ostream.write_all(word.as_bytes()) + ostream.write_all(word) } diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs index 3be410b8a59..7a583bde6a8 100644 --- a/src/uu/fmt/src/parasplit.rs +++ b/src/uu/fmt/src/parasplit.rs @@ -5,7 +5,7 @@ // spell-checker:ignore (ToDO) INFTY MULT PSKIP accum aftertab beforetab breakwords fmt's formatline linebreak linebreaking linebreaks linelen maxlength minlength nchars noformat noformatline ostream overlen parasplit plass pmatch poffset posn powf prefixindent punct signum slen sstart tabwidth tlen underlen winfo wlen wordlen wordsplits xanti xprefix -use std::io::{BufRead, Lines}; +use std::io::BufRead; use std::iter::Peekable; use std::slice::Iter; use unicode_width::UnicodeWidthChar; @@ -26,6 +26,47 @@ fn char_width(c: char) -> usize { } } +fn utf8_char_width(byte: u8) -> Option { + match byte { + 0x00..=0x7F => Some(1), + 0xC2..=0xDF => Some(2), + 0xE0..=0xEF => Some(3), + 0xF0..=0xF4 => Some(4), + _ => None, + } +} + +fn decode_char(bytes: &[u8], start: usize) -> (Option, usize) { + let first = bytes[start]; + if first < 0x80 { + return (Some(first as char), 1); + } + + let Some(width) = utf8_char_width(first) else { + return (None, 1); + }; + + if start + width > bytes.len() { + return (None, 1); + } + + match std::str::from_utf8(&bytes[start..start + width]) { + Ok(s) => (s.chars().next(), width), + Err(_) => (None, 1), + } +} + +fn byte_display_width(bytes: &[u8]) -> usize { + let mut width = 0; + let mut idx = 0; + while idx < bytes.len() { + let (ch, consumed) = decode_char(bytes, idx); + width += ch.map_or(1, char_width); + idx += consumed; + } + width +} + /// GNU fmt has a more restrictive definition of whitespace than Unicode. /// It only considers ASCII whitespace characters (space, tab, newline, etc.) /// and excludes many Unicode whitespace characters like non-breaking spaces. @@ -34,12 +75,16 @@ fn is_fmt_whitespace(c: char) -> bool { matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C') } +fn is_fmt_whitespace_byte(b: u8) -> bool { + matches!(b, b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C) +} + // lines with PSKIP, lacking PREFIX, or which are entirely blank are // NoFormatLines; otherwise, they are FormatLines #[derive(Debug)] pub enum Line { FormatLine(FileLine), - NoFormatLine(String, bool), + NoFormatLine(Vec, bool), } impl Line { @@ -52,7 +97,7 @@ impl Line { } /// when we know that it's a [`Line::NoFormatLine`], as in the [`ParagraphStream`] iterator - fn get_noformatline(self) -> (String, bool) { + fn get_noformatline(self) -> (Vec, bool) { match self { Self::NoFormatLine(s, b) => (s, b), Self::FormatLine(..) => panic!("Found FormatLine when expecting NoFormatLine"), @@ -64,7 +109,7 @@ impl Line { /// the next line or not #[derive(Debug)] pub struct FileLine { - line: String, + line: Vec, /// The end of the indent, always the start of the text indent_end: usize, /// The end of the PREFIX's indent, that is, the spaces before the prefix @@ -78,75 +123,86 @@ pub struct FileLine { /// Iterator that produces a stream of Lines from a file pub struct FileLines<'a> { opts: &'a FmtOptions, - lines: Lines<&'a mut FileOrStdReader>, + reader: &'a mut FileOrStdReader, } impl FileLines<'_> { - fn new<'b>(opts: &'b FmtOptions, lines: Lines<&'b mut FileOrStdReader>) -> FileLines<'b> { - FileLines { opts, lines } + fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> FileLines<'b> { + FileLines { opts, reader } } /// returns true if this line should be formatted - fn match_prefix(&self, line: &str) -> (bool, usize) { + fn match_prefix(&self, line: &[u8]) -> (bool, usize) { let Some(prefix) = &self.opts.prefix else { return (true, 0); }; - FileLines::match_prefix_generic(prefix, line, self.opts.xprefix) + FileLines::match_prefix_generic(prefix.as_bytes(), line, self.opts.xprefix) } /// returns true if this line should be formatted - fn match_anti_prefix(&self, line: &str) -> bool { + fn match_anti_prefix(&self, line: &[u8]) -> bool { let Some(anti_prefix) = &self.opts.anti_prefix else { return true; }; - match FileLines::match_prefix_generic(anti_prefix, line, self.opts.xanti_prefix) { + match FileLines::match_prefix_generic(anti_prefix.as_bytes(), line, self.opts.xanti_prefix) + { (true, _) => false, (_, _) => true, } } - fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, usize) { + fn match_prefix_generic(pfx: &[u8], line: &[u8], exact: bool) -> (bool, usize) { if line.starts_with(pfx) { return (true, 0); } if !exact { - // we do it this way rather than byte indexing to support unicode whitespace chars - for (i, char) in line.char_indices() { + let mut i = 0; + while i < line.len() { if line[i..].starts_with(pfx) { return (true, i); - } else if !is_fmt_whitespace(char) { + } else if !is_fmt_whitespace_byte(line[i]) { break; } + i += 1; } } (false, 0) } - fn compute_indent(&self, string: &str, prefix_end: usize) -> (usize, usize, usize) { + fn compute_indent(&self, bytes: &[u8], prefix_end: usize) -> (usize, usize, usize) { let mut prefix_len = 0; let mut indent_len = 0; - let mut indent_end = 0; - for (os, c) in string.char_indices() { - if os == prefix_end { + let mut indent_end = bytes.len(); + let mut idx = 0; + while idx < bytes.len() { + if idx == prefix_end { // we found the end of the prefix, so this is the printed length of the prefix here prefix_len = indent_len; } - if (os >= prefix_end) && !is_fmt_whitespace(c) { - // found first non-whitespace after prefix, this is indent_end - indent_end = os; + let byte = bytes[idx]; + if idx >= prefix_end && !is_fmt_whitespace_byte(byte) { + indent_end = idx; break; - } else if c == '\t' { - // compute tab length + } + + if byte == b'\t' { indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; - } else { - // non-tab character - indent_len += char_width(c); + idx += 1; + continue; } + + let (ch, consumed) = decode_char(bytes, idx); + indent_len += ch.map_or(1, char_width); + idx += consumed; + continue; + } + if indent_end == bytes.len() { + indent_end = idx; } (indent_end, prefix_len, indent_len) } @@ -156,14 +212,26 @@ impl Iterator for FileLines<'_> { type Item = Line; fn next(&mut self) -> Option { - let n = self.lines.next()?.ok()?; + let mut buf = Vec::new(); + match self.reader.read_until(b'\n', &mut buf) { + Ok(0) => return None, + Ok(_) => {} + Err(_) => return None, + } + if buf.ends_with(b"\n") { + buf.pop(); + if buf.ends_with(b"\r") { + buf.pop(); + } + } + let n = buf; // if this line is entirely whitespace, // emit a blank line // Err(true) indicates that this was a linebreak, // which is important to know when detecting mail headers - if n.chars().all(is_fmt_whitespace) { - return Some(Line::NoFormatLine(String::new(), true)); + if n.iter().all(|&b| is_fmt_whitespace_byte(b)) { + return Some(Line::NoFormatLine(Vec::new(), true)); } let (pmatch, poffset) = self.match_prefix(&n[..]); @@ -181,8 +249,8 @@ impl Iterator for FileLines<'_> { // following line) if pmatch && n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..] - .chars() - .all(is_fmt_whitespace) + .iter() + .all(|&b| is_fmt_whitespace_byte(b)) { return Some(Line::NoFormatLine(n, false)); } @@ -210,20 +278,20 @@ impl Iterator for FileLines<'_> { /// A paragraph : a collection of [`FileLines`] that are to be formatted /// plus info about the paragraph's indentation /// -/// We only retain the String from the [`FileLine`]; the other info +/// We retain the raw bytes from the [`FileLine`]; the other info /// is only there to help us in deciding how to merge lines into Paragraphs #[derive(Debug)] pub struct Paragraph { /// the lines of the file - lines: Vec, + lines: Vec>, /// string representing the init, that is, the first line's indent - pub init_str: String, + pub init_str: Vec, /// printable length of the init string considering TABWIDTH pub init_len: usize, - /// byte location of end of init in first line String + /// byte location of end of init in first line buffer init_end: usize, /// string representing indent - pub indent_str: String, + pub indent_str: Vec, /// length of above pub indent_len: usize, /// byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward) @@ -242,7 +310,7 @@ pub struct ParagraphStream<'a> { impl ParagraphStream<'_> { pub fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> ParagraphStream<'b> { - let lines = FileLines::new(opts, reader.lines()).peekable(); + let lines = FileLines::new(opts, reader).peekable(); // at the beginning of the file, we might find mail headers ParagraphStream { lines, @@ -260,10 +328,10 @@ impl ParagraphStream<'_> { false } else { let l_slice = &line.line[..]; - if l_slice.starts_with("From ") { + if l_slice.starts_with(b"From ") { true } else { - let Some(colon_posn) = l_slice.find(':') else { + let Some(colon_posn) = l_slice.iter().position(|&b| b == b':') else { return false; }; @@ -273,18 +341,18 @@ impl ParagraphStream<'_> { } l_slice[..colon_posn] - .chars() - .all(|x| !matches!(x as usize, y if !(33..=126).contains(&y))) + .iter() + .all(|&b| (33..=126).contains(&(b as usize)) && b != b':') } } } } impl Iterator for ParagraphStream<'_> { - type Item = Result; + type Item = Result>; #[allow(clippy::cognitive_complexity)] - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option>> { // return a NoFormatLine in an Err; it should immediately be output let noformat = match self.lines.peek()? { Line::FormatLine(_) => false, @@ -299,10 +367,10 @@ impl Iterator for ParagraphStream<'_> { } // found a FormatLine, now build a paragraph - let mut init_str = String::new(); + let mut init_str = Vec::new(); let mut init_end = 0; let mut init_len = 0; - let mut indent_str = String::new(); + let mut indent_str = Vec::new(); let mut indent_end = 0; let mut indent_len = 0; let mut prefix_len = 0; @@ -326,11 +394,11 @@ impl Iterator for ParagraphStream<'_> { // there can't be any indent or prefixindent because otherwise is_mail_header // would fail since there cannot be any whitespace before the colon in a // valid header field - indent_str.push_str(" "); + indent_str.extend_from_slice(b" "); indent_len = 2; } else { if self.opts.crown || self.opts.tagged { - init_str.push_str(&fl.line[..fl.indent_end]); + init_str.extend_from_slice(&fl.line[..fl.indent_end]); init_len = fl.indent_len; init_end = fl.indent_end; } else { @@ -340,7 +408,7 @@ impl Iterator for ParagraphStream<'_> { // these will be overwritten in the 2nd line of crown or tagged mode, but // we are not guaranteed to get to the 2nd line, e.g., if the next line // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around - indent_str.push_str(&fl.line[..fl.indent_end]); + indent_str.extend_from_slice(&fl.line[..fl.indent_end]); indent_len = fl.indent_len; indent_end = fl.indent_end; @@ -354,7 +422,7 @@ impl Iterator for ParagraphStream<'_> { // pretty arbitrary. // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big. if self.opts.tagged { - indent_str.push_str(" "); + indent_str.extend_from_slice(b" "); indent_len += 4; } } @@ -381,7 +449,7 @@ impl Iterator for ParagraphStream<'_> { // this is part of the same paragraph, get the indent info from this line indent_str.clear(); - indent_str.push_str(&fl.line[..fl.indent_end]); + indent_str.extend_from_slice(&fl.line[..fl.indent_end]); indent_len = fl.indent_len; indent_end = fl.indent_end; @@ -449,11 +517,14 @@ impl<'a> ParaWords<'a> { self.para .lines .iter() - .flat_map(|x| x.split_whitespace()) + .flat_map(|x| { + x.split(|b| is_fmt_whitespace_byte(*b)) + .filter(|segment| !segment.is_empty()) + }) .map(|x| WordInfo { word: x, word_start: 0, - word_nchars: x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped) + word_nchars: byte_display_width(x), before_tab: None, after_tab: 0, sentence_start: false, @@ -492,24 +563,22 @@ impl<'a> ParaWords<'a> { struct WordSplit<'a> { opts: &'a FmtOptions, - string: &'a str, + bytes: &'a [u8], length: usize, position: usize, prev_punct: bool, } impl WordSplit<'_> { - fn analyze_tabs(&self, string: &str) -> (Option, usize, Option) { - // given a string, determine (length before tab) and (printed length after first tab) - // if there are no tabs, beforetab = -1 and aftertab is the printed length + fn analyze_tabs(&self, bytes: &[u8]) -> (Option, usize, Option) { let mut beforetab = None; let mut aftertab = 0; let mut word_start = None; - for (os, c) in string.char_indices() { - if !is_fmt_whitespace(c) { - word_start = Some(os); + for (idx, b) in bytes.iter().enumerate() { + if !is_fmt_whitespace_byte(*b) { + word_start = Some(idx); break; - } else if c == '\t' { + } else if *b == b'\t' { if beforetab.is_none() { beforetab = Some(aftertab); aftertab = 0; @@ -522,28 +591,29 @@ impl WordSplit<'_> { } (beforetab, aftertab, word_start) } -} -impl WordSplit<'_> { - fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> { - // wordsplits *must* start at a non-whitespace character - let trim_string = string.trim_start_matches(is_fmt_whitespace); + fn new<'b>(opts: &'b FmtOptions, bytes: &'b [u8]) -> WordSplit<'b> { + let start = bytes + .iter() + .position(|&b| !is_fmt_whitespace_byte(b)) + .unwrap_or(bytes.len()); + let trimmed = &bytes[start..]; WordSplit { opts, - string: trim_string, - length: string.len(), + bytes: trimmed, + length: trimmed.len(), position: 0, prev_punct: false, } } - fn is_punctuation(c: char) -> bool { - matches!(c, '!' | '.' | '?') + fn is_punctuation_byte(b: u8) -> bool { + matches!(b, b'!' | b'.' | b'?') } } pub struct WordInfo<'a> { - pub word: &'a str, + pub word: &'a [u8], pub word_start: usize, pub word_nchars: usize, pub before_tab: Option, @@ -567,7 +637,7 @@ impl<'a> Iterator for WordSplit<'a> { // find the start of the next word, and record if we find a tab character let (before_tab, after_tab, word_start) = - if let (b, a, Some(s)) = self.analyze_tabs(&self.string[old_position..]) { + if let (b, a, Some(s)) = self.analyze_tabs(&self.bytes[old_position..]) { (b, a, s + old_position) } else { self.position = self.length; @@ -578,17 +648,27 @@ impl<'a> Iterator for WordSplit<'a> { // note that this preserves the invariant that self.position // points to whitespace character OR end of string let mut word_nchars = 0; - self.position = match self.string[word_start..].find(|x: char| { - if is_fmt_whitespace(x) { - true + let mut idx = word_start; + let mut last_ascii = None; + while idx < self.length { + let (ch, consumed) = decode_char(self.bytes, idx); + let is_whitespace = ch.filter(|c| c.is_ascii()).is_some_and(is_fmt_whitespace); + if is_whitespace { + break; + } + word_nchars += ch.map_or(1, char_width); + if let Some(ch) = ch { + if ch.is_ascii() { + last_ascii = Some(ch as u8); + } else { + last_ascii = None; + } } else { - word_nchars += char_width(x); - false + last_ascii = None; } - }) { - None => self.length, - Some(s) => s + word_start, - }; + idx += consumed; + } + self.position = idx; let word_start_relative = word_start - old_position; // if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence. @@ -596,16 +676,14 @@ impl<'a> Iterator for WordSplit<'a> { self.prev_punct && (before_tab.is_some() || word_start_relative > 1); // now record whether this word ends in punctuation - self.prev_punct = match self.string[..self.position].chars().next_back() { - Some(ch) => WordSplit::is_punctuation(ch), - _ => panic!("fatal: expected word not to be empty"), - }; + let ends_punct = last_ascii.is_some_and(WordSplit::is_punctuation_byte); + self.prev_punct = ends_punct; let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform { - (&self.string[word_start..self.position], 0, None, 0) + (&self.bytes[word_start..self.position], 0, None, 0) } else { ( - &self.string[old_position..self.position], + &self.bytes[old_position..self.position], word_start_relative, before_tab, after_tab, @@ -619,7 +697,7 @@ impl<'a> Iterator for WordSplit<'a> { before_tab, after_tab, sentence_start: is_start_of_sentence, - ends_punct: self.prev_punct, + ends_punct, new_line, }) } diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs index 5959569de6f..66d08817eda 100644 --- a/tests/by-util/test_fmt.rs +++ b/tests/by-util/test_fmt.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore plass samp +// spell-checker:ignore plass samp FFFD #[cfg(target_os = "linux")] use std::os::unix::ffi::OsStringExt; use uutests::new_ucmd; @@ -323,6 +323,8 @@ fn test_fmt_unicode_whitespace_handling() { ("non-breaking space", non_breaking_space), ("figure space", figure_space), ("narrow no-break space", narrow_no_break_space), + ("word joiner", "\u{2060}"), + ("cyrillic kha", "\u{0445}"), ] { let input = format!("={char}="); let result = new_ucmd!() @@ -397,3 +399,17 @@ fn fmt_reflow_unicode() { .succeeds() .stdout_is("漢字漢字\n💐\n日本語の文字\n"); } + +#[test] +fn test_fmt_invalid_utf8() { + // Regression test for handling invalid UTF-8 input (e.g. ISO-8859-1) + // fmt should not drop lines with invalid UTF-8. + // \xA0 is non-breaking space in ISO-8859-1, but invalid in UTF-8. + // We expect GNU-compatible passthrough of the raw byte, not lossy replacement. + let input = b"=\xA0="; + new_ucmd!() + .args(&["-s", "-w1"]) + .pipe_in(input) + .succeeds() + .stdout_is_bytes(b"=\xA0=\n"); +}