From 073f7fce7d910142ad66fc1939dd0eef8cc0d3dc Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 16:02:41 +0900 Subject: [PATCH 1/7] test: add word joiner and cyrillic kha character tests for fmt --- tests/by-util/test_fmt.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs index 5959569de6f..8eb97d2090e 100644 --- a/tests/by-util/test_fmt.rs +++ b/tests/by-util/test_fmt.rs @@ -323,6 +323,8 @@ fn test_fmt_unicode_whitespace_handling() { ("non-breaking space", non_breaking_space), ("figure space", figure_space), ("narrow no-break space", narrow_no_break_space), + ("word joiner", "\u{2060}"), + ("cyrillic kha", "\u{0445}"), ] { let input = format!("={char}="); let result = new_ucmd!() From 36a01a1c8946c0e1ea2cd317a4212a6efa007a1d Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 16:29:27 +0900 Subject: [PATCH 2/7] feat: Enhance `fmt` to handle invalid UTF-8 input by replacing malformed sequences instead of dropping lines. 
--- src/uu/fmt/src/parasplit.rs | 24 ++++++++++++++++++------ tests/by-util/test_fmt.rs | 14 ++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs index 3be410b8a59..e77c54662bd 100644 --- a/src/uu/fmt/src/parasplit.rs +++ b/src/uu/fmt/src/parasplit.rs @@ -5,7 +5,7 @@ // spell-checker:ignore (ToDO) INFTY MULT PSKIP accum aftertab beforetab breakwords fmt's formatline linebreak linebreaking linebreaks linelen maxlength minlength nchars noformat noformatline ostream overlen parasplit plass pmatch poffset posn powf prefixindent punct signum slen sstart tabwidth tlen underlen winfo wlen wordlen wordsplits xanti xprefix -use std::io::{BufRead, Lines}; +use std::io::BufRead; use std::iter::Peekable; use std::slice::Iter; use unicode_width::UnicodeWidthChar; @@ -78,12 +78,12 @@ pub struct FileLine { /// Iterator that produces a stream of Lines from a file pub struct FileLines<'a> { opts: &'a FmtOptions, - lines: Lines<&'a mut FileOrStdReader>, + reader: &'a mut FileOrStdReader, } impl FileLines<'_> { - fn new<'b>(opts: &'b FmtOptions, lines: Lines<&'b mut FileOrStdReader>) -> FileLines<'b> { - FileLines { opts, lines } + fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> FileLines<'b> { + FileLines { opts, reader } } /// returns true if this line should be formatted @@ -156,7 +156,19 @@ impl Iterator for FileLines<'_> { type Item = Line; fn next(&mut self) -> Option { - let n = self.lines.next()?.ok()?; + let mut buf = Vec::new(); + match self.reader.read_until(b'\n', &mut buf) { + Ok(0) => return None, + Ok(_) => {} + Err(_) => return None, + } + if buf.ends_with(b"\n") { + buf.pop(); + if buf.ends_with(b"\r") { + buf.pop(); + } + } + let n = String::from_utf8_lossy(&buf).into_owned(); // if this line is entirely whitespace, // emit a blank line @@ -242,7 +254,7 @@ pub struct ParagraphStream<'a> { impl ParagraphStream<'_> { pub fn new<'b>(opts: &'b FmtOptions, reader: &'b 
mut FileOrStdReader) -> ParagraphStream<'b> { - let lines = FileLines::new(opts, reader.lines()).peekable(); + let lines = FileLines::new(opts, reader).peekable(); // at the beginning of the file, we might find mail headers ParagraphStream { lines, diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs index 8eb97d2090e..8319922ad68 100644 --- a/tests/by-util/test_fmt.rs +++ b/tests/by-util/test_fmt.rs @@ -399,3 +399,17 @@ fn fmt_reflow_unicode() { .succeeds() .stdout_is("漢字漢字\n💐\n日本語の文字\n"); } + +#[test] +fn test_fmt_invalid_utf8() { + // Regression test for handling invalid UTF-8 input (e.g. ISO-8859-1) + // fmt should not drop lines with invalid UTF-8. + // \xA0 is non-breaking space in ISO-8859-1, but invalid in UTF-8. + // We expect it to be replaced by replacement character and treated as non-space. + let input = b"=\xA0="; + new_ucmd!() + .args(&["-s", "-w1"]) + .pipe_in(input) + .succeeds() + .stdout_is("=\u{FFFD}=\n"); +} From 2c617d4af28160e8d3be9668fc814fb86a50ae5c Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 16:44:28 +0900 Subject: [PATCH 3/7] chore: add FFFD to spell-checker ignore list in fmt test. --- tests/by-util/test_fmt.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs index 8319922ad68..7ba051f1e5e 100644 --- a/tests/by-util/test_fmt.rs +++ b/tests/by-util/test_fmt.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-// spell-checker:ignore plass samp +// spell-checker:ignore plass samp FFFD #[cfg(target_os = "linux")] use std::os::unix::ffi::OsStringExt; use uutests::new_ucmd; From db77543abe8417a3c715ad3d441c684c1d3bd9f0 Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 20:16:31 +0900 Subject: [PATCH 4/7] refactor(uu/fmt): switch string outputs to byte slices for efficiency - Changed `indent_str` field in `BreakArgs` to `indent: &[u8]` to avoid repeated UTF-8 conversions. - Updated `write_all` calls to pass `&s` instead of `s.as_bytes()` in fmt.rs and similar string/byteslicing in linebreak.rs. - Modified method signatures in parasplit.rs to accept `&[u8]` instead of `&str` for prefix matching, ensuring consistent byte-level operations without assuming valid UTF-8. --- src/uu/fmt/src/fmt.rs | 2 +- src/uu/fmt/src/linebreak.rs | 30 ++--- src/uu/fmt/src/parasplit.rs | 230 +++++++++++++++++++++++------------- 3 files changed, 164 insertions(+), 98 deletions(-) diff --git a/src/uu/fmt/src/fmt.rs b/src/uu/fmt/src/fmt.rs index 882c0834a68..c06c5702b41 100644 --- a/src/uu/fmt/src/fmt.rs +++ b/src/uu/fmt/src/fmt.rs @@ -234,7 +234,7 @@ fn process_file( match para_result { Err(s) => { ostream - .write_all(s.as_bytes()) + .write_all(&s) .map_err_context(|| translate!("fmt-error-failed-to-write-output"))?; ostream .write_all(b"\n") diff --git a/src/uu/fmt/src/linebreak.rs b/src/uu/fmt/src/linebreak.rs index 653e7c3e049..a64728aeb00 100644 --- a/src/uu/fmt/src/linebreak.rs +++ b/src/uu/fmt/src/linebreak.rs @@ -14,7 +14,7 @@ use crate::parasplit::{ParaWords, Paragraph, WordInfo}; struct BreakArgs<'a> { opts: &'a FmtOptions, init_len: usize, - indent_str: &'a str, + indent: &'a [u8], indent_len: usize, uniform: bool, ostream: &'a mut BufWriter, @@ -59,11 +59,11 @@ pub fn break_lines( let p_init_len = winfo.word_nchars + if opts.crown || opts.tagged { // handle "init" portion - ostream.write_all(para.init_str.as_bytes())?; + ostream.write_all(&para.init_str)?; para.init_len } else if 
!para.mail_header { // for non-(crown, tagged) that's the same as a normal indent - ostream.write_all(p_indent.as_bytes())?; + ostream.write_all(p_indent)?; p_indent_len } else { // except that mail headers get no indent at all @@ -71,7 +71,7 @@ pub fn break_lines( }; // write first word after writing init - ostream.write_all(winfo.word.as_bytes())?; + ostream.write_all(winfo.word)?; // does this paragraph require uniform spacing? let uniform = para.mail_header || opts.uniform; @@ -79,7 +79,7 @@ pub fn break_lines( let mut break_args = BreakArgs { opts, init_len: p_init_len, - indent_str: p_indent, + indent: p_indent, indent_len: p_indent_len, uniform, ostream, @@ -121,7 +121,7 @@ fn accum_words_simple<'a>( ); if l + wlen + slen > args.opts.width { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; write_with_spaces(&winfo.word[winfo.word_start..], 0, args.ostream)?; Ok((args.indent_len + winfo.word_nchars, winfo.ends_punct)) } else { @@ -146,7 +146,7 @@ fn break_knuth_plass<'a, T: Clone + Iterator>>( (false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| { if fresh { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; } // at each breakpoint, keep emitting words until we find the word matching this breakpoint for winfo in &mut iter { @@ -167,7 +167,7 @@ fn break_knuth_plass<'a, T: Clone + Iterator>>( if std::ptr::eq(winfo, next_break) { // OK, we found the matching word if break_before { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; write_with_spaces(&winfo.word[winfo.word_start..], 0, args.ostream)?; } else { // breaking after this word, so that means "fresh" is true for the next iteration @@ -186,7 +186,7 @@ fn break_knuth_plass<'a, T: Clone + Iterator>>( // after the last linebreak, write out the rest of the final line. 
for winfo in iter { if fresh { - write_newline(args.indent_str, args.ostream)?; + write_newline(args.indent, args.ostream)?; } let (slen, word) = slice_if_fresh( fresh, @@ -474,13 +474,13 @@ fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> usize /// Otherwise, compute `slen` and leave whitespace alone. fn slice_if_fresh( fresh: bool, - word: &str, + word: &[u8], start: usize, uniform: bool, newline: bool, sstart: bool, punct: bool, -) -> (usize, &str) { +) -> (usize, &[u8]) { if fresh { (0, &word[start..]) } else { @@ -489,14 +489,14 @@ fn slice_if_fresh( } /// Write a newline and add the indent. -fn write_newline(indent: &str, ostream: &mut BufWriter) -> std::io::Result<()> { +fn write_newline(indent: &[u8], ostream: &mut BufWriter) -> std::io::Result<()> { ostream.write_all(b"\n")?; - ostream.write_all(indent.as_bytes()) + ostream.write_all(indent) } /// Write the word, along with slen spaces. fn write_with_spaces( - word: &str, + word: &[u8], slen: usize, ostream: &mut BufWriter, ) -> std::io::Result<()> { @@ -505,5 +505,5 @@ fn write_with_spaces( } else if slen == 1 { ostream.write_all(b" ")?; } - ostream.write_all(word.as_bytes()) + ostream.write_all(word) } diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs index e77c54662bd..920a8d14fb8 100644 --- a/src/uu/fmt/src/parasplit.rs +++ b/src/uu/fmt/src/parasplit.rs @@ -26,6 +26,47 @@ fn char_width(c: char) -> usize { } } +fn utf8_char_width(byte: u8) -> Option { + match byte { + 0x00..=0x7F => Some(1), + 0xC2..=0xDF => Some(2), + 0xE0..=0xEF => Some(3), + 0xF0..=0xF4 => Some(4), + _ => None, + } +} + +fn decode_char(bytes: &[u8], start: usize) -> (Option, usize) { + let first = bytes[start]; + if first < 0x80 { + return (Some(first as char), 1); + } + + let Some(width) = utf8_char_width(first) else { + return (None, 1); + }; + + if start + width > bytes.len() { + return (None, 1); + } + + match std::str::from_utf8(&bytes[start..start + width]) { + Ok(s) => 
(s.chars().next(), width), + Err(_) => (None, 1), + } +} + +fn byte_display_width(bytes: &[u8]) -> usize { + let mut width = 0; + let mut idx = 0; + while idx < bytes.len() { + let (ch, consumed) = decode_char(bytes, idx); + width += ch.map_or(1, char_width); + idx += consumed; + } + width +} + /// GNU fmt has a more restrictive definition of whitespace than Unicode. /// It only considers ASCII whitespace characters (space, tab, newline, etc.) /// and excludes many Unicode whitespace characters like non-breaking spaces. @@ -34,12 +75,16 @@ fn is_fmt_whitespace(c: char) -> bool { matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C') } +fn is_fmt_whitespace_byte(b: u8) -> bool { + matches!(b, b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C) +} + // lines with PSKIP, lacking PREFIX, or which are entirely blank are // NoFormatLines; otherwise, they are FormatLines #[derive(Debug)] pub enum Line { FormatLine(FileLine), - NoFormatLine(String, bool), + NoFormatLine(Vec, bool), } impl Line { @@ -52,7 +97,7 @@ impl Line { } /// when we know that it's a [`Line::NoFormatLine`], as in the [`ParagraphStream`] iterator - fn get_noformatline(self) -> (String, bool) { + fn get_noformatline(self) -> (Vec, bool) { match self { Self::NoFormatLine(s, b) => (s, b), Self::FormatLine(..) 
=> panic!("Found FormatLine when expecting NoFormatLine"), @@ -64,7 +109,7 @@ impl Line { /// the next line or not #[derive(Debug)] pub struct FileLine { - line: String, + line: Vec, /// The end of the indent, always the start of the text indent_end: usize, /// The end of the PREFIX's indent, that is, the spaces before the prefix @@ -87,67 +132,76 @@ impl FileLines<'_> { } /// returns true if this line should be formatted - fn match_prefix(&self, line: &str) -> (bool, usize) { + fn match_prefix(&self, line: &[u8]) -> (bool, usize) { let Some(prefix) = &self.opts.prefix else { return (true, 0); }; - FileLines::match_prefix_generic(prefix, line, self.opts.xprefix) + FileLines::match_prefix_generic(prefix.as_bytes(), line, self.opts.xprefix) } /// returns true if this line should be formatted - fn match_anti_prefix(&self, line: &str) -> bool { + fn match_anti_prefix(&self, line: &[u8]) -> bool { let Some(anti_prefix) = &self.opts.anti_prefix else { return true; }; - match FileLines::match_prefix_generic(anti_prefix, line, self.opts.xanti_prefix) { + match FileLines::match_prefix_generic(anti_prefix.as_bytes(), line, self.opts.xanti_prefix) + { (true, _) => false, (_, _) => true, } } - fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, usize) { + fn match_prefix_generic(pfx: &[u8], line: &[u8], exact: bool) -> (bool, usize) { if line.starts_with(pfx) { return (true, 0); } if !exact { - // we do it this way rather than byte indexing to support unicode whitespace chars - for (i, char) in line.char_indices() { + let mut i = 0; + while i < line.len() { if line[i..].starts_with(pfx) { return (true, i); - } else if !is_fmt_whitespace(char) { + } else if !is_fmt_whitespace_byte(line[i]) { break; } + i += 1; } } (false, 0) } - fn compute_indent(&self, string: &str, prefix_end: usize) -> (usize, usize, usize) { + fn compute_indent(&self, bytes: &[u8], prefix_end: usize) -> (usize, usize, usize) { let mut prefix_len = 0; let mut indent_len = 0; - let mut 
indent_end = 0; - for (os, c) in string.char_indices() { - if os == prefix_end { + let mut indent_end = bytes.len(); + let mut idx = 0; + while idx < bytes.len() { + if idx == prefix_end { // we found the end of the prefix, so this is the printed length of the prefix here prefix_len = indent_len; } - if (os >= prefix_end) && !is_fmt_whitespace(c) { - // found first non-whitespace after prefix, this is indent_end - indent_end = os; + let byte = bytes[idx]; + if idx >= prefix_end && !is_fmt_whitespace_byte(byte) { + indent_end = idx; break; - } else if c == '\t' { - // compute tab length + } else if byte == b'\t' { indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; + idx += 1; + continue; } else { - // non-tab character - indent_len += char_width(c); + let (ch, consumed) = decode_char(bytes, idx); + indent_len += ch.map_or(1, char_width); + idx += consumed; + continue; } } + if indent_end == bytes.len() { + indent_end = idx; + } (indent_end, prefix_len, indent_len) } } @@ -168,14 +222,14 @@ impl Iterator for FileLines<'_> { buf.pop(); } } - let n = String::from_utf8_lossy(&buf).into_owned(); + let n = buf; // if this line is entirely whitespace, // emit a blank line // Err(true) indicates that this was a linebreak, // which is important to know when detecting mail headers - if n.chars().all(is_fmt_whitespace) { - return Some(Line::NoFormatLine(String::new(), true)); + if n.iter().all(|&b| is_fmt_whitespace_byte(b)) { + return Some(Line::NoFormatLine(Vec::new(), true)); } let (pmatch, poffset) = self.match_prefix(&n[..]); @@ -193,8 +247,8 @@ impl Iterator for FileLines<'_> { // following line) if pmatch && n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..] 
- .chars() - .all(is_fmt_whitespace) + .iter() + .all(|&b| is_fmt_whitespace_byte(b)) { return Some(Line::NoFormatLine(n, false)); } @@ -222,20 +276,20 @@ impl Iterator for FileLines<'_> { /// A paragraph : a collection of [`FileLines`] that are to be formatted /// plus info about the paragraph's indentation /// -/// We only retain the String from the [`FileLine`]; the other info +/// We retain the raw bytes from the [`FileLine`]; the other info /// is only there to help us in deciding how to merge lines into Paragraphs #[derive(Debug)] pub struct Paragraph { /// the lines of the file - lines: Vec, + lines: Vec>, /// string representing the init, that is, the first line's indent - pub init_str: String, + pub init_str: Vec, /// printable length of the init string considering TABWIDTH pub init_len: usize, - /// byte location of end of init in first line String + /// byte location of end of init in first line buffer init_end: usize, /// string representing indent - pub indent_str: String, + pub indent_str: Vec, /// length of above pub indent_len: usize, /// byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward) @@ -272,10 +326,10 @@ impl ParagraphStream<'_> { false } else { let l_slice = &line.line[..]; - if l_slice.starts_with("From ") { + if l_slice.starts_with(b"From ") { true } else { - let Some(colon_posn) = l_slice.find(':') else { + let Some(colon_posn) = l_slice.iter().position(|&b| b == b':') else { return false; }; @@ -285,18 +339,18 @@ impl ParagraphStream<'_> { } l_slice[..colon_posn] - .chars() - .all(|x| !matches!(x as usize, y if !(33..=126).contains(&y))) + .iter() + .all(|&b| (33..=126).contains(&(b as usize)) && b != b':') } } } } impl Iterator for ParagraphStream<'_> { - type Item = Result; + type Item = Result>; #[allow(clippy::cognitive_complexity)] - fn next(&mut self) -> Option> { + fn next(&mut self) -> Option>> { // return a NoFormatLine in an Err; it should immediately be output let noformat = match 
self.lines.peek()? { Line::FormatLine(_) => false, @@ -311,10 +365,10 @@ impl Iterator for ParagraphStream<'_> { } // found a FormatLine, now build a paragraph - let mut init_str = String::new(); + let mut init_str = Vec::new(); let mut init_end = 0; let mut init_len = 0; - let mut indent_str = String::new(); + let mut indent_str = Vec::new(); let mut indent_end = 0; let mut indent_len = 0; let mut prefix_len = 0; @@ -338,11 +392,11 @@ impl Iterator for ParagraphStream<'_> { // there can't be any indent or prefixindent because otherwise is_mail_header // would fail since there cannot be any whitespace before the colon in a // valid header field - indent_str.push_str(" "); + indent_str.extend_from_slice(b" "); indent_len = 2; } else { if self.opts.crown || self.opts.tagged { - init_str.push_str(&fl.line[..fl.indent_end]); + init_str.extend_from_slice(&fl.line[..fl.indent_end]); init_len = fl.indent_len; init_end = fl.indent_end; } else { @@ -352,7 +406,7 @@ impl Iterator for ParagraphStream<'_> { // these will be overwritten in the 2nd line of crown or tagged mode, but // we are not guaranteed to get to the 2nd line, e.g., if the next line // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around - indent_str.push_str(&fl.line[..fl.indent_end]); + indent_str.extend_from_slice(&fl.line[..fl.indent_end]); indent_len = fl.indent_len; indent_end = fl.indent_end; @@ -366,7 +420,7 @@ impl Iterator for ParagraphStream<'_> { // pretty arbitrary. // Perhaps a better default would be 1 TABWIDTH? But ugh that's so big. 
if self.opts.tagged { - indent_str.push_str(" "); + indent_str.extend_from_slice(b" "); indent_len += 4; } } @@ -393,7 +447,7 @@ impl Iterator for ParagraphStream<'_> { // this is part of the same paragraph, get the indent info from this line indent_str.clear(); - indent_str.push_str(&fl.line[..fl.indent_end]); + indent_str.extend_from_slice(&fl.line[..fl.indent_end]); indent_len = fl.indent_len; indent_end = fl.indent_end; @@ -461,11 +515,14 @@ impl<'a> ParaWords<'a> { self.para .lines .iter() - .flat_map(|x| x.split_whitespace()) + .flat_map(|x| { + x.split(|b| is_fmt_whitespace_byte(*b)) + .filter(|segment| !segment.is_empty()) + }) .map(|x| WordInfo { word: x, word_start: 0, - word_nchars: x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped) + word_nchars: byte_display_width(x), before_tab: None, after_tab: 0, sentence_start: false, @@ -504,24 +561,22 @@ impl<'a> ParaWords<'a> { struct WordSplit<'a> { opts: &'a FmtOptions, - string: &'a str, + bytes: &'a [u8], length: usize, position: usize, prev_punct: bool, } impl WordSplit<'_> { - fn analyze_tabs(&self, string: &str) -> (Option, usize, Option) { - // given a string, determine (length before tab) and (printed length after first tab) - // if there are no tabs, beforetab = -1 and aftertab is the printed length + fn analyze_tabs(&self, bytes: &[u8]) -> (Option, usize, Option) { let mut beforetab = None; let mut aftertab = 0; let mut word_start = None; - for (os, c) in string.char_indices() { - if !is_fmt_whitespace(c) { - word_start = Some(os); + for (idx, b) in bytes.iter().enumerate() { + if !is_fmt_whitespace_byte(*b) { + word_start = Some(idx); break; - } else if c == '\t' { + } else if *b == b'\t' { if beforetab.is_none() { beforetab = Some(aftertab); aftertab = 0; @@ -534,28 +589,29 @@ impl WordSplit<'_> { } (beforetab, aftertab, word_start) } -} -impl WordSplit<'_> { - fn new<'b>(opts: &'b FmtOptions, string: &'b str) -> WordSplit<'b> { - // wordsplits *must* start at a non-whitespace 
character - let trim_string = string.trim_start_matches(is_fmt_whitespace); + fn new<'b>(opts: &'b FmtOptions, bytes: &'b [u8]) -> WordSplit<'b> { + let start = bytes + .iter() + .position(|&b| !is_fmt_whitespace_byte(b)) + .unwrap_or(bytes.len()); + let trimmed = &bytes[start..]; WordSplit { opts, - string: trim_string, - length: string.len(), + bytes: trimmed, + length: trimmed.len(), position: 0, prev_punct: false, } } - fn is_punctuation(c: char) -> bool { - matches!(c, '!' | '.' | '?') + fn is_punctuation_byte(b: u8) -> bool { + matches!(b, b'!' | b'.' | b'?') } } pub struct WordInfo<'a> { - pub word: &'a str, + pub word: &'a [u8], pub word_start: usize, pub word_nchars: usize, pub before_tab: Option, @@ -579,7 +635,7 @@ impl<'a> Iterator for WordSplit<'a> { // find the start of the next word, and record if we find a tab character let (before_tab, after_tab, word_start) = - if let (b, a, Some(s)) = self.analyze_tabs(&self.string[old_position..]) { + if let (b, a, Some(s)) = self.analyze_tabs(&self.bytes[old_position..]) { (b, a, s + old_position) } else { self.position = self.length; @@ -590,17 +646,29 @@ impl<'a> Iterator for WordSplit<'a> { // note that this preserves the invariant that self.position // points to whitespace character OR end of string let mut word_nchars = 0; - self.position = match self.string[word_start..].find(|x: char| { - if is_fmt_whitespace(x) { - true + let mut idx = word_start; + let mut last_ascii = None; + while idx < self.length { + let (ch, consumed) = decode_char(self.bytes, idx); + let is_whitespace = ch + .filter(|c| c.is_ascii()) + .map_or(false, |c| is_fmt_whitespace(c)); + if is_whitespace { + break; + } + word_nchars += ch.map_or(1, char_width); + if let Some(ch) = ch { + if ch.is_ascii() { + last_ascii = Some(ch as u8); + } else { + last_ascii = None; + } } else { - word_nchars += char_width(x); - false + last_ascii = None; } - }) { - None => self.length, - Some(s) => s + word_start, - }; + idx += consumed; + } + 
self.position = idx; let word_start_relative = word_start - old_position; // if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence. @@ -608,16 +676,14 @@ impl<'a> Iterator for WordSplit<'a> { self.prev_punct && (before_tab.is_some() || word_start_relative > 1); // now record whether this word ends in punctuation - self.prev_punct = match self.string[..self.position].chars().next_back() { - Some(ch) => WordSplit::is_punctuation(ch), - _ => panic!("fatal: expected word not to be empty"), - }; + let ends_punct = last_ascii.map_or(false, WordSplit::is_punctuation_byte); + self.prev_punct = ends_punct; let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform { - (&self.string[word_start..self.position], 0, None, 0) + (&self.bytes[word_start..self.position], 0, None, 0) } else { ( - &self.string[old_position..self.position], + &self.bytes[old_position..self.position], word_start_relative, before_tab, after_tab, @@ -631,7 +697,7 @@ impl<'a> Iterator for WordSplit<'a> { before_tab, after_tab, sentence_start: is_start_of_sentence, - ends_punct: self.prev_punct, + ends_punct, new_line, }) } From 4445c5acf9bd7be0273e0d2dc0965fc39f68e602 Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 20:32:42 +0900 Subject: [PATCH 5/7] refactor(fmt): replace Option::map_or(false, f) with is_some_and(f) - Updated indentation calculation in FileLines to use is_some_and for tab and character checks, avoiding unnecessary computations and improving code flow. - Changed punctuation checks in WordSplit iterator to use is_some_and for cleaner, more idiomatic Rust code. - This refactor enhances readability and leverages short-circuiting behavior. 
--- src/uu/fmt/src/parasplit.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs index 920a8d14fb8..0d8b798327e 100644 --- a/src/uu/fmt/src/parasplit.rs +++ b/src/uu/fmt/src/parasplit.rs @@ -188,16 +188,18 @@ impl FileLines<'_> { if idx >= prefix_end && !is_fmt_whitespace_byte(byte) { indent_end = idx; break; - } else if byte == b'\t' { + } + + if byte == b'\t' { indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; idx += 1; continue; - } else { - let (ch, consumed) = decode_char(bytes, idx); - indent_len += ch.map_or(1, char_width); - idx += consumed; - continue; } + + let (ch, consumed) = decode_char(bytes, idx); + indent_len += ch.map_or(1, char_width); + idx += consumed; + continue; } if indent_end == bytes.len() { indent_end = idx; @@ -652,7 +654,7 @@ impl<'a> Iterator for WordSplit<'a> { let (ch, consumed) = decode_char(self.bytes, idx); let is_whitespace = ch .filter(|c| c.is_ascii()) - .map_or(false, |c| is_fmt_whitespace(c)); + .is_some_and(is_fmt_whitespace); if is_whitespace { break; } @@ -676,7 +678,7 @@ impl<'a> Iterator for WordSplit<'a> { self.prev_punct && (before_tab.is_some() || word_start_relative > 1); // now record whether this word ends in punctuation - let ends_punct = last_ascii.map_or(false, WordSplit::is_punctuation_byte); + let ends_punct = last_ascii.is_some_and(WordSplit::is_punctuation_byte); self.prev_punct = ends_punct; let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform { From c59f1bcfd1a938e032bfc0482d3351fc91f24f58 Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 20:34:01 +0900 Subject: [PATCH 6/7] style(fmt): compact whitespace check in WordSplit iterator to single line Refactored the is_whitespace assignment by combining chained method calls on one line for improved conciseness and readability. 
--- src/uu/fmt/src/parasplit.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/uu/fmt/src/parasplit.rs b/src/uu/fmt/src/parasplit.rs index 0d8b798327e..7a583bde6a8 100644 --- a/src/uu/fmt/src/parasplit.rs +++ b/src/uu/fmt/src/parasplit.rs @@ -652,9 +652,7 @@ impl<'a> Iterator for WordSplit<'a> { let mut last_ascii = None; while idx < self.length { let (ch, consumed) = decode_char(self.bytes, idx); - let is_whitespace = ch - .filter(|c| c.is_ascii()) - .is_some_and(is_fmt_whitespace); + let is_whitespace = ch.filter(|c| c.is_ascii()).is_some_and(is_fmt_whitespace); if is_whitespace { break; } From 6a313a42b3b5b167d9da8993487d16fbb0f7a69b Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 19 Nov 2025 20:56:15 +0900 Subject: [PATCH 7/7] fix(test_fmt): align invalid UTF-8 handling with GNU-compatible passthrough Updated test_fmt_invalid_utf8 to expect raw byte (\xA0) passthrough instead of replacement character (\u{FFFD}) for invalid UTF-8 input, ensuring GNU-compatible behavior in fmt. This fixes the test expectation to match actual output, avoiding lossy conversion. --- tests/by-util/test_fmt.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/by-util/test_fmt.rs b/tests/by-util/test_fmt.rs index 7ba051f1e5e..66d08817eda 100644 --- a/tests/by-util/test_fmt.rs +++ b/tests/by-util/test_fmt.rs @@ -405,11 +405,11 @@ fn test_fmt_invalid_utf8() { // Regression test for handling invalid UTF-8 input (e.g. ISO-8859-1) // fmt should not drop lines with invalid UTF-8. // \xA0 is non-breaking space in ISO-8859-1, but invalid in UTF-8. - // We expect it to be replaced by replacement character and treated as non-space. + // We expect GNU-compatible passthrough of the raw byte, not lossy replacement. let input = b"=\xA0="; new_ucmd!() .args(&["-s", "-w1"]) .pipe_in(input) .succeeds() - .stdout_is("=\u{FFFD}=\n"); + .stdout_is_bytes(b"=\xA0=\n"); }