From e07bf6f24a7fa7438dd757d6e761c100dd920d68 Mon Sep 17 00:00:00 2001 From: Christian Duerr Date: Sun, 8 Oct 2023 16:57:46 +0200 Subject: Fix regex matches ending on multiline This fixes an issue where the reverse search for the regex start would truncate a character when ending on a newline, since it was omitting the EOI check in that case. This also fixes a separate issue which caused regexes which capture empty strings (e.g.: `.*`) to always report a match. This is a regression introduced in 73276b6. --- alacritty_terminal/src/term/search.rs | 253 +++++++++++++++++++++++++++------- 1 file changed, 205 insertions(+), 48 deletions(-) (limited to 'alacritty_terminal/src/term') diff --git a/alacritty_terminal/src/term/search.rs b/alacritty_terminal/src/term/search.rs index da994eec..1a3c7de8 100644 --- a/alacritty_terminal/src/term/search.rs +++ b/alacritty_terminal/src/term/search.rs @@ -8,7 +8,7 @@ use regex_automata::hybrid::dfa::{Builder, Cache, Config, DFA}; pub use regex_automata::hybrid::BuildError; use regex_automata::nfa::thompson::Config as ThompsonConfig; use regex_automata::util::syntax::Config as SyntaxConfig; -use regex_automata::{Anchored, Input}; +use regex_automata::{Anchored, Input, MatchKind}; use crate::grid::{BidirectionalIterator, Dimensions, GridIterator, Indexed}; use crate::index::{Boundary, Column, Direction, Point, Side}; @@ -23,8 +23,10 @@ pub type Match = RangeInclusive; /// Terminal regex search state. #[derive(Clone, Debug)] pub struct RegexSearch { - fdfa: LazyDfa, - rdfa: LazyDfa, + left_fdfa: LazyDfa, + left_rdfa: LazyDfa, + right_rdfa: LazyDfa, + right_fdfa: LazyDfa, } impl RegexSearch { @@ -39,24 +41,39 @@ impl RegexSearch { let config = Config::new().minimum_cache_clear_count(Some(3)).minimum_bytes_per_state(Some(10)); let max_size = config.get_cache_capacity(); - let mut thompson_config = ThompsonConfig::new().nfa_size_limit(Some(max_size)); - - // Create Regex DFA for left-to-right search. - let fdfa = Builder::new() - .configure(config.clone()) - .syntax(syntax_config) - .thompson(thompson_config.clone()) - .build(search)?; - - // Create Regex DFA for right-to-left search. - thompson_config = thompson_config.reverse(true); - let rdfa = Builder::new() - .configure(config) - .syntax(syntax_config) - .thompson(thompson_config) - .build(search)?; - - Ok(RegexSearch { fdfa: fdfa.into(), rdfa: rdfa.into() }) + let thompson_config = ThompsonConfig::new().nfa_size_limit(Some(max_size)); + + // Create DFAs to find start/end in left-to-right search. + let right_fdfa = LazyDfa::new( + search, + config.clone(), + syntax_config, + thompson_config.clone(), + Direction::Right, + false, + )?; + let right_rdfa = LazyDfa::new( + search, + config.clone(), + syntax_config, + thompson_config.clone(), + Direction::Left, + true, + )?; + + // Create DFAs to find start/end in right-to-left search. + let left_fdfa = LazyDfa::new( + search, + config.clone(), + syntax_config, + thompson_config.clone(), + Direction::Left, + false, + )?; + let left_rdfa = + LazyDfa::new(search, config, syntax_config, thompson_config, Direction::Right, true)?; + + Ok(RegexSearch { left_fdfa, left_rdfa, right_fdfa, right_rdfa }) } } @@ -67,10 +84,32 @@ struct LazyDfa { cache: Cache, } -impl From for LazyDfa { - fn from(dfa: DFA) -> Self { +impl LazyDfa { + fn new( + search: &str, + mut config: Config, + syntax: SyntaxConfig, + mut thompson: ThompsonConfig, + direction: Direction, + reverse: bool, + ) -> Result> { + thompson = match direction { + Direction::Left => thompson.reverse(true), + Direction::Right => thompson.reverse(false), + }; + config = if reverse { + config.match_kind(MatchKind::All) + } else { + config.match_kind(MatchKind::LeftmostFirst) + }; + + // Create the DFA. + let dfa = + Builder::new().configure(config).syntax(syntax).thompson(thompson).build(search)?; + let cache = dfa.create_cache(); - Self { dfa, cache } + + Ok(Self { dfa, cache }) } } @@ -190,9 +229,10 @@ impl Term { end: Point, ) -> Option { // Find start and end of match. - let match_start = self.regex_search(start, end, Direction::Left, false, &mut regex.rdfa)?; + let match_start = + self.regex_search(start, end, Direction::Left, false, &mut regex.left_fdfa)?; let match_end = - self.regex_search(match_start, start, Direction::Right, true, &mut regex.fdfa)?; + self.regex_search(match_start, start, Direction::Right, true, &mut regex.left_rdfa)?; Some(match_start..=match_end) } @@ -207,9 +247,10 @@ impl Term { end: Point, ) -> Option { // Find start and end of match. - let match_end = self.regex_search(start, end, Direction::Right, false, &mut regex.fdfa)?; + let match_end = + self.regex_search(start, end, Direction::Right, false, &mut regex.right_fdfa)?; let match_start = - self.regex_search(match_end, start, Direction::Left, true, &mut regex.rdfa)?; + self.regex_search(match_end, start, Direction::Left, true, &mut regex.right_rdfa)?; Some(match_start..=match_end) } @@ -272,8 +313,18 @@ impl Term { let mut point = iter.point(); let mut last_point = point; + let mut consumed_bytes = 0; + + // Reset the regex state to restart the search. + macro_rules! reset_state { + () => {{ + state = regex.dfa.start_state_forward(&mut regex.cache, &input)?; + consumed_bytes = 0; + regex_match = None; + }}; + } - loop { + 'outer: loop { // Convert char to array of bytes. let mut buf = [0; 4]; let utf8_len = c.encode_utf8(&mut buf).len(); @@ -287,27 +338,57 @@ impl Term { }; state = regex.dfa.next_state(&mut regex.cache, state, byte)?; + consumed_bytes += 1; - // Matches require one additional BYTE of lookahead, so we check the match state for - // the first byte of every new character to determine if the last character was a - // match. if i == 0 && state.is_match() { + // Matches require one additional BYTE of lookahead, so we check the match state + // for the first byte of every new character to determine if the last character + // was a match. regex_match = Some(last_point); + } else if state.is_dead() { + if consumed_bytes == 2 { + // Reset search if we found an empty match. + // + // With an unanchored search, a dead state only occurs after the end of a + // match has been found. While we want to abort after the first match has + // ended, we don't want empty matches since we cannot highlight them. + // + // So once we encounter an empty match, we reset our parser state and clear + // the match, effectively starting a new search one character farther than + // before. + // + // An empty match requires consuming `2` bytes, since the first byte will + // report the match for the empty string, while the second byte then + // reports the dead state indicating the first character isn't part of the + // match. + reset_state!(); + + // Retry this character if first byte caused failure. + // + // After finding an empty match, we want to advance the search start by one + // character. So if the first character has multiple bytes and the dead + // state isn't reached at `i == 0`, then we continue with the rest of the + // loop to advance the parser by one character. + if i == 0 { + continue 'outer; + } + } else { + // Abort on dead state. + break 'outer; + } } } - // Abort on dead states. - if state.is_dead() { - break; - } - // Stop once we've reached the target point. if point == end || done { // When reaching the end-of-input, we need to notify the parser that no look-ahead - // is possible and check if the current state is still a match. + // is possible and check for state changes. state = regex.dfa.next_eoi_state(&mut regex.cache, state)?; if state.is_match() { regex_match = Some(point); + } else if state.is_dead() && consumed_bytes == 1 { + // Ignore empty matches. + regex_match = None; } break; @@ -339,18 +420,19 @@ impl Term { if (last_point.column == last_column && point.column == Column(0) && !last_wrapped) || (last_point.column == Column(0) && point.column == last_column && !wrapped) { - match regex_match { - Some(_) => break, - None => { - // When reaching the end-of-input, we need to notify the parser that no - // look-ahead is possible and check if the current state is still a match. - state = regex.dfa.next_eoi_state(&mut regex.cache, state)?; - if state.is_match() { - regex_match = Some(last_point); - } + // When reaching the end-of-input, we need to notify the parser that no + // look-ahead is possible and check if the current state is still a match. + state = regex.dfa.next_eoi_state(&mut regex.cache, state)?; + if state.is_match() { + regex_match = Some(last_point); + } - state = regex.dfa.start_state_forward(&mut regex.cache, &input)?; + match regex_match { + // Stop if we found a non-empty match before the linebreak. + Some(_) if (!state.is_dead() || consumed_bytes > 1) && consumed_bytes != 0 => { + break; }, + _ => reset_state!(), } } @@ -858,6 +940,69 @@ mod tests { assert_eq!(term.regex_search_left(&mut regex, start, end), Some(match_start..=match_end)); } + #[test] + fn multiline() { + #[rustfmt::skip] + let term = mock_term("\ + test \r\n\ + test\ + "); + + const PATTERN: &str = "[a-z]*"; + let mut regex = RegexSearch::new(PATTERN).unwrap(); + let start = Point::new(Line(0), Column(0)); + let end = Point::new(Line(0), Column(3)); + let match_start = Point::new(Line(0), Column(0)); + assert_eq!(term.regex_search_right(&mut regex, start, end), Some(match_start..=end)); + + let mut regex = RegexSearch::new(PATTERN).unwrap(); + let start = Point::new(Line(0), Column(4)); + let end = Point::new(Line(0), Column(0)); + let match_start = Point::new(Line(1), Column(0)); + let match_end = Point::new(Line(1), Column(3)); + assert_eq!(term.regex_search_right(&mut regex, start, end), Some(match_start..=match_end)); + } + + #[test] + fn empty_match() { + #[rustfmt::skip] + let term = mock_term(" abc "); + + const PATTERN: &str = "[a-z]*"; + let mut regex = RegexSearch::new(PATTERN).unwrap(); + let start = Point::new(Line(0), Column(0)); + let end = Point::new(Line(0), Column(4)); + let match_start = Point::new(Line(0), Column(1)); + let match_end = Point::new(Line(0), Column(3)); + assert_eq!(term.regex_search_right(&mut regex, start, end), Some(match_start..=match_end)); + } + + #[test] + fn empty_match_multibyte() { + #[rustfmt::skip] + let term = mock_term(" ↑"); + + const PATTERN: &str = "[a-z]*"; + let mut regex = RegexSearch::new(PATTERN).unwrap(); + let start = Point::new(Line(0), Column(0)); + let end = Point::new(Line(0), Column(1)); + assert_eq!(term.regex_search_right(&mut regex, start, end), None); + } + + #[test] + fn empty_match_multiline() { + #[rustfmt::skip] + let term = mock_term("abc \nxxx"); + + const PATTERN: &str = "[a-z]*"; + let mut regex = RegexSearch::new(PATTERN).unwrap(); + let start = Point::new(Line(0), Column(3)); + let end = Point::new(Line(1), Column(2)); + let match_start = Point::new(Line(1), Column(0)); + let match_end = Point::new(Line(1), Column(2)); + assert_eq!(term.regex_search_right(&mut regex, start, end), Some(match_start..=match_end)); + } + #[test] fn leading_spacer() { #[rustfmt::skip] @@ -952,4 +1097,16 @@ mod tests { let end = Point::new(Line(0), Column(9999)); assert_eq!(term.regex_search_right(&mut regex, start, end), None); } + + #[test] + fn greed_is_good() { + #[rustfmt::skip] + let term = mock_term("https://github.com"); + + // Bottom to top. + let mut regex = RegexSearch::new("/github.com|https://github.com").unwrap(); + let start = Point::new(Line(0), Column(0)); + let end = Point::new(Line(0), Column(17)); + assert_eq!(term.regex_search_right(&mut regex, start, end), Some(start..=end)); + } } -- cgit v1.2.3-54-g00ecf