✨ Añade normalización de cadenas ASCII

2026-01-04 13:00:16 +01:00 · 2026-01-04 13:00:16 +01:00 · e9d326cd99
commit e9d326cd99
parent 41d5091348
4 changed files with 437 additions and 53 deletions
--- a/src/util.rs
+++ b/src/util.rs
@ -2,6 +2,7 @@

 use crate::trace;

+use std::borrow::Cow;
 use std::env;
 use std::io;
 use std::path::{Path, PathBuf};
@ -24,6 +25,109 @@ pub use pagetop_minimal::paste;

 // **< FUNCIONES ÚTILES >***************************************************************************

+/// Errores posibles al normalizar una cadena ASCII con [`normalize_ascii()`].
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum NormalizeAsciiError {
+    /// La entrada está vacía (`""`).
+    IsEmpty,
+    /// La entrada quedó vacía tras recortar separadores ASCII al inicio/fin.
+    EmptyAfterTrimming,
+    /// La entrada contiene al menos un byte no ASCII (>= 0x80).
+    NonAscii,
+}
+
+/// Normaliza una cadena ASCII con uno o varios tokens separados.
+///
+/// Los *separadores* son caracteres `is_ascii_whitespace()` como `' '`, `'\t'`, `'\n'` o `'\r'`.
+///
+/// Reglas:
+///
+/// - Devuelve `Err(NormalizeAsciiError::IsEmpty)` si la entrada es `""`.
+/// - Devuelve `Err(NormalizeAsciiError::NonAscii)` si contiene algún byte no ASCII (`>= 0x80`).
+/// - Devuelve `Err(NormalizeAsciiError::EmptyAfterTrimming)` si después de recortar separadores al
+///   inicio/fin, la entrada queda vacía.
+/// - Sustituye cualquier secuencia de separadores por un único espacio `' '`.
+/// - El resultado queda siempre en minúsculas.
+///
+/// Intenta devolver siempre `Cow::Borrowed` para no reservar memoria, y `Cow::Owned` sólo si ha
+/// tenido que aplicar cambios para normalizar.
+///
+/// # Ejemplo
+///
+/// ```rust
+/// # use pagetop::util;
+/// assert_eq!(util::normalize_ascii("  Foo\tBAR  CLi\r\n").unwrap().as_ref(), "foo bar cli");
+/// ```
+pub fn normalize_ascii<'a>(input: &'a str) -> Result<Cow<'a, str>, NormalizeAsciiError> {
+    let bytes = input.as_bytes();
+    if bytes.is_empty() {
+        return Err(NormalizeAsciiError::IsEmpty);
+    }
+
+    let mut start = 0usize;
+    let mut end = 0usize;
+
+    let mut needs_alloc = false;
+    let mut needs_alloc_ws = false;
+    let mut has_content = false;
+    let mut prev_sep = false;
+
+    for (pos, &b) in bytes.iter().enumerate() {
+        if !b.is_ascii() {
+            return Err(NormalizeAsciiError::NonAscii);
+        }
+        if b.is_ascii_whitespace() {
+            if has_content {
+                if b != b' ' || prev_sep {
+                    needs_alloc_ws = true;
+                }
+                prev_sep = true;
+            }
+        } else {
+            if needs_alloc_ws {
+                needs_alloc = true;
+                needs_alloc_ws = false;
+            }
+            if b.is_ascii_uppercase() {
+                needs_alloc = true;
+            }
+            prev_sep = false;
+            if !has_content {
+                start = pos;
+                has_content = true;
+            }
+            end = pos + 1;
+        }
+    }
+
+    if !has_content {
+        return Err(NormalizeAsciiError::EmptyAfterTrimming);
+    }
+
+    let slice = &input[start..end];
+
+    if !needs_alloc {
+        return Ok(Cow::Borrowed(slice));
+    }
+
+    let mut output = String::with_capacity(slice.len());
+    let mut prev_sep = true;
+
+    for &b in slice.as_bytes() {
+        if b.is_ascii_whitespace() {
+            if !prev_sep {
+                output.push(' ');
+                prev_sep = true;
+            }
+        } else {
+            output.push(b.to_ascii_lowercase() as char);
+            prev_sep = false;
+        }
+    }
+
+    Ok(Cow::Owned(output))
+}
+
 /// Resuelve y valida la ruta de un directorio existente, devolviendo una ruta absoluta.
 ///
 /// - Si la ruta es relativa, se resuelve respecto al directorio del proyecto según la variable de