To: vim_dev@googlegroups.com Subject: Patch 7.3.1011 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 7.3.1011 Problem: New regexp engine is inefficient with multi-byte characters. Solution: Handle a character at a time instead of a byte at a time. Also make \Z partly work. Files: src/regexp_nfa.c, src/testdir/test95.in, src/testdir/test95.ok *** ../vim-7.3.1010/src/regexp_nfa.c 2013-05-24 20:25:28.000000000 +0200 --- src/regexp_nfa.c 2013-05-24 21:49:43.000000000 +0200 *************** *** 46,54 **** NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */ NFA_START_INVISIBLE, NFA_END_INVISIBLE, - NFA_MULTIBYTE, /* Next nodes in NFA are part of the same - multibyte char */ - NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */ NFA_COMPOSING, /* Next nodes in NFA are part of the composing multibyte char */ NFA_END_COMPOSING, /* End of a composing char in the NFA */ --- 46,51 ---- *************** *** 195,220 **** *post_ptr++ = c; \ } while (0) - #define EMIT_MBYTE(c) \ - len = (*mb_char2bytes)(c, buf); \ - EMIT(buf[0]); \ - for (i = 1; i < len; i++) \ - { \ - EMIT(buf[i]); \ - EMIT(NFA_CONCAT); \ - } \ - EMIT(NFA_MULTIBYTE); - - #define EMIT_COMPOSING_UTF(input) \ - len = utfc_ptr2len(input); \ - EMIT(input[0]); \ - for (i = 1; i < len; i++) \ - { \ - EMIT(input[i]); \ - EMIT(NFA_CONCAT); \ - } \ - EMIT(NFA_COMPOSING); - /* * Initialize internal variables before NFA compilation. * Return OK on success, FAIL otherwise. --- 192,197 ---- *************** *** 611,618 **** #ifdef FEAT_MBYTE char_u *old_regparse = regparse; int clen; - int len; - static char_u buf[30]; int i; #endif int extra = 0; --- 588,593 ---- *************** *** 845,858 **** return FAIL; c = coll_get_char(); ! #ifdef FEAT_MBYTE ! if ((*mb_char2len)(c) > 1) ! { ! EMIT_MBYTE(c); ! } ! else ! #endif ! 
EMIT(c); break; /* Catch \%^ and \%$ regardless of where they appear in the --- 820,826 ---- return FAIL; c = coll_get_char(); ! EMIT(c); break; /* Catch \%^ and \%$ regardless of where they appear in the *************** *** 1135,1146 **** * skip it. */ for (c = startc + 1; c <= endc; c++) { ! if ((*mb_char2len)(c) > 1) ! { ! EMIT_MBYTE(c); ! } ! else ! EMIT(c); TRY_NEG(); EMIT_GLUE(); } --- 1103,1109 ---- * skip it. */ for (c = startc + 1; c <= endc; c++) { ! EMIT(c); TRY_NEG(); EMIT_GLUE(); } *************** *** 1187,1200 **** if (got_coll_char == TRUE && startc == 0) EMIT(0x0a); else ! #ifdef FEAT_MBYTE ! if ((*mb_char2len)(startc) > 1) ! { ! EMIT_MBYTE(startc); ! } ! else ! #endif ! EMIT(startc); TRY_NEG(); EMIT_GLUE(); } --- 1150,1156 ---- if (got_coll_char == TRUE && startc == 0) EMIT(0x0a); else ! EMIT(startc); TRY_NEG(); EMIT_GLUE(); } *************** *** 1242,1271 **** int plen; nfa_do_multibyte: ! /* length of current char, with composing chars, ! * from pointer */ ! plen = (*mb_ptr2len)(old_regparse); ! if (enc_utf8 && clen != plen) ! { ! /* A composing character is always handled as a ! * separate atom, surrounded by NFA_COMPOSING and ! * NFA_END_COMPOSING. Note that right now we are * building the postfix form, not the NFA itself; * a composing char could be: a, b, c, NFA_COMPOSING ! * where 'a', 'b', 'c' are chars with codes > 256. ! */ ! EMIT_COMPOSING_UTF(old_regparse); regparse = old_regparse + plen; } else - /* A multi-byte character is always handled as a - * separate atom, surrounded by NFA_MULTIBYTE and - * NFA_END_MULTIBYTE */ - if (plen > 1) - { - EMIT_MBYTE(c); - } - else #endif { c = no_Magic(c); --- 1198,1227 ---- int plen; nfa_do_multibyte: ! /* Length of current char with composing chars. */ ! if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse))) ! { ! /* A base character plus composing characters. ! * This requires creating a separate atom as if enclosing ! * the characters in (), where NFA_COMPOSING is the ( and ! 
* NFA_END_COMPOSING is the ). Note that right now we are * building the postfix form, not the NFA itself; * a composing char could be: a, b, c, NFA_COMPOSING ! * where 'b' and 'c' are chars with codes > 256. */ ! i = 0; ! for (;;) ! { ! EMIT(c); ! if (i > 0) ! EMIT(NFA_CONCAT); ! if ((i += utf_char2len(c)) >= plen) /* advance by the char's byte length first, then compare */ ! break; ! c = utf_ptr2char(old_regparse + i); ! } ! EMIT(NFA_COMPOSING); regparse = old_regparse + plen; } else #endif { c = no_Magic(c); *************** *** 1702,1710 **** case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break; case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break; - case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break; - case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break; - case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break; case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break; --- 1658,1663 ---- *************** *** 2194,2200 **** } e1 = POP(); e1.start->negated = TRUE; ! if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING) e1.start->out1->negated = TRUE; PUSH(e1); break; --- 2147,2153 ---- } e1 = POP(); e1.start->negated = TRUE; ! 
if (e1.start->c == NFA_COMPOSING) e1.start->out1->negated = TRUE; PUSH(e1); break; *************** *** 2311,2316 **** --- 2264,2279 ---- PUSH(frag(s, list1(&s1->out))); break; + case NFA_COMPOSING: /* char with composing char */ + #if 0 + /* TODO */ + if (regflags & RF_ICOMBINE) + { + goto normalchar; + } + #endif + /* FALLTHROUGH */ + case NFA_MOPEN + 0: /* Submatch */ case NFA_MOPEN + 1: case NFA_MOPEN + 2: *************** *** 2322,2329 **** case NFA_MOPEN + 8: case NFA_MOPEN + 9: case NFA_NOPEN: /* \%( "Invisible Submatch" */ - case NFA_MULTIBYTE: /* mbyte char */ - case NFA_COMPOSING: /* composing char */ if (nfa_calc_size == TRUE) { nstate += 2; --- 2285,2290 ---- *************** *** 2336,2344 **** case NFA_NOPEN: mclose = NFA_NCLOSE; break; - case NFA_MULTIBYTE: - mclose = NFA_END_MULTIBYTE; - break; case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break; --- 2297,2302 ---- *************** *** 2377,2385 **** goto theend; patch(e.out, s1); ! if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING) ! /* MULTIBYTE->out1 = END_MULTIBYTE ! * COMPOSING->out1 = END_COMPOSING */ patch(list1(&s->out1), s1); PUSH(frag(s, list1(&s1->out))); --- 2335,2342 ---- goto theend; patch(e.out, s1); ! if (mopen == NFA_COMPOSING) ! /* COMPOSING->out1 = END_COMPOSING */ patch(list1(&s->out1), s1); PUSH(frag(s, list1(&s1->out))); *************** *** 2540,2556 **** case NFA_COMPOSING: /* nfa_regmatch() will match all the bytes of this composing char. */ break; - - case NFA_MULTIBYTE: - /* nfa_regmatch() will match all the bytes of this multibyte char. */ - break; #endif - case NFA_END_MULTIBYTE: - /* Successfully matched this mbyte char */ - addstate(l, state->out, m, off, lid, match); - break; - case NFA_NOPEN: case NFA_NCLOSE: addstate(l, state->out, m, off, lid, match); --- 2497,2504 ---- *************** *** 2841,2847 **** regsub_T *submatch; regsub_T *m; { ! int c = -1; int n; int i = 0; int result; --- 2789,2795 ---- regsub_T *submatch; regsub_T *m; { ! 
int c; int n; int i = 0; int result; *************** *** 2859,2865 **** List *listtbl[2][2]; List *ll; int listid = 1; - int endnode; List *thislist; List *nextlist; List *neglist; --- 2807,2812 ---- *************** *** 3190,3222 **** break; } ! case NFA_MULTIBYTE: case NFA_COMPOSING: ! endnode = t->state->c + 1; result = OK; sta = t->state->out; ! len = 1; ! while (sta->c != endnode && len <= n) { ! if (reginput[len-1] != sta->c) ! { ! result = FAIL; break; ! } ! len++; sta = sta->out; } /* if input char length doesn't match regexp char length */ ! if (len -1 < n || sta->c != endnode) result = FAIL; ! end = t->state->out1; /* NFA_END_MULTIBYTE or ! NFA_END_COMPOSING */ /* If \Z was present, then ignore composing characters */ ! if (ireg_icombine && endnode == NFA_END_COMPOSING) result = 1 ^ sta->negated; ADD_POS_NEG_STATE(end); break; case NFA_NEWL: if (!reg_line_lbr && REG_MULTI --- 3137,3171 ---- break; } ! #ifdef FEAT_MBYTE case NFA_COMPOSING: ! { ! int mc = c; ! result = OK; sta = t->state->out; ! len = 0; ! while (sta->c != NFA_END_COMPOSING && len < n) { ! if (len > 0) ! mc = mb_ptr2char(reginput + len); ! if (mc != sta->c) break; ! len += mb_char2len(mc); sta = sta->out; } /* if input char length doesn't match regexp char length */ ! if (len < n || sta->c != NFA_END_COMPOSING) result = FAIL; ! end = t->state->out1; /* NFA_END_COMPOSING */ /* If \Z was present, then ignore composing characters */ ! if (ireg_icombine) result = 1 ^ sta->negated; ADD_POS_NEG_STATE(end); break; + } + #endif case NFA_NEWL: if (!reg_line_lbr && REG_MULTI *************** *** 3425,3430 **** --- 3374,3387 ---- if (!result) result = ireg_ic == TRUE && MB_TOLOWER(t->state->c) == MB_TOLOWER(c); + #ifdef FEAT_MBYTE + /* If there is a composing character which is not being + * ignored there can be no match. Match with composing + * character uses NFA_COMPOSING above. 
*/ + if (result && enc_utf8 && !ireg_icombine + && n != utf_char2len(c)) + result = FALSE; + #endif ADD_POS_NEG_STATE(t->state); break; } *** ../vim-7.3.1010/src/testdir/test95.in 2013-05-24 20:25:28.000000000 +0200 --- src/testdir/test95.in 2013-05-24 20:45:08.000000000 +0200 *************** *** 35,40 **** --- 35,44 ---- :call add(tl, ['\f\+', '&*Ÿfname ', 'fname']) :call add(tl, ['\%#=1\f\+', '&*Ÿfname ', 'fname']) + :"""" Test composing character matching + :call add(tl, ['.ม', 'xม่x yมy', 'yม']) + :call add(tl, ['.ม่', 'xม่x yมy', 'xม่']) + :"""" Test \Z :call add(tl, ['ú\Z', 'x']) *** ../vim-7.3.1010/src/testdir/test95.ok 2013-05-24 20:25:28.000000000 +0200 --- src/testdir/test95.ok 2013-05-24 20:44:41.000000000 +0200 *************** *** 9,13 **** --- 9,15 ---- OK - \%#=1\i\+ OK - \f\+ OK - \%#=1\f\+ + OK - .ม + OK - .ม่ OK - ú\Z OK - [^[=a=]]\+ *** ../vim-7.3.1010/src/version.c 2013-05-24 20:25:28.000000000 +0200 --- src/version.c 2013-05-24 21:56:02.000000000 +0200 *************** *** 730,731 **** --- 730,733 ---- { /* Add new patch number below this line */ + /**/ + 1011, /**/ -- If you had to identify, in one word, the reason why the human race has not achieved, and never will achieve, its full potential, that word would be "meetings." /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ an exciting new programming language -- http://www.Zimbu.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///