diff -rc perl-5.8.0/patchlevel.h perl-5.8.0.patched/patchlevel.h *** perl-5.8.0/patchlevel.h Fri Jul 19 00:08:27 2002 --- perl-5.8.0.patched/patchlevel.h Thu Nov 15 17:18:17 2007 *************** *** 82 **** ! ,NULL --- 82,83 ---- ! ,"REGEXP0 - fix for UTF-8 recoding in regexps - CVE-2007-5116" ! ,NULL diff -rc perl-5.8.0/regcomp.c perl-5.8.0.patched/regcomp.c *** perl-5.8.0/regcomp.c Mon Jul 8 21:10:49 2002 --- perl-5.8.0.patched/regcomp.c Thu Nov 15 17:18:17 2007 *************** *** 125,131 **** I32 extralen; I32 seen_zerolen; I32 seen_evals; ! I32 utf8; #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) --- 125,134 ---- I32 extralen; I32 seen_zerolen; I32 seen_evals; ! I32 utf8; /* whether the pattern is utf8 or not */ ! I32 orig_utf8; /* whether the pattern was originally in utf8 */ ! /* XXX use this for future optimisation of case ! * where pattern must be upgraded to utf8. */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) *************** *** 151,156 **** --- 154,160 ---- #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) + #define RExC_orig_utf8 (pRExC_state->orig_utf8) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ *************** *** 1737,1751 **** if (exp == NULL) FAIL("NULL regexp argument"); ! RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; - RExC_precomp = exp; DEBUG_r({ if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], ! (int)(xend - exp), RExC_precomp, PL_colors[1]); }); RExC_flags = pm->op_pmflags; RExC_sawback = 0; --- 1741,1757 ---- if (exp == NULL) FAIL("NULL regexp argument"); ! RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; DEBUG_r({ if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], ! (int)(xend - exp), exp, PL_colors[1]); }); + + redo_first_pass: + RExC_precomp = exp; RExC_flags = pm->op_pmflags; RExC_sawback = 0; *************** *** 1770,1775 **** --- 1776,1800 ---- if (reg(pRExC_state, 0, &flags) == NULL) { RExC_precomp = Nullch; return(NULL); + } + if (RExC_utf8 && !RExC_orig_utf8) { + /* It's possible to write a regexp in ascii that represents unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + XXX: somehow figure out how to make this less expensive... + -- dmq */ + STRLEN len = xend-exp; + DEBUG_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8; + SAVEFREEPV(exp); + goto redo_first_pass; } DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); diff -rc perl-5.8.0/t/op/pat.t perl-5.8.0.patched/t/op/pat.t *** perl-5.8.0/t/op/pat.t Mon Jul 1 15:42:19 2002 --- perl-5.8.0.patched/t/op/pat.t Thu Nov 15 17:19:14 2007 *************** *** 6,12 **** $| = 1; ! print "1..922\n"; BEGIN { chdir 't' if -d 't'; --- 6,12 ---- $| = 1; ! print "1..924\n"; BEGIN { chdir 't' if -d 't'; *************** *** 2900,2904 **** --- 2900,2913 ---- } } } + + { + use warnings; + my @w; + local $SIG{__WARN__}=sub{push @w,"@_"}; + my $c=qq(\x{DF}); + ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8"); + ok(@w==0, "No warnings"); + } $test = 923;