replace problem in mb_ereg(i)_replace
今まで放置していたのですが、ふと思い出したので。
mb_ereg_replace関数およびmb_eregi_replace関数で微妙な挙動に遭遇したのですが、皆さんはどうやって回避しているのだろうか。。。教えて欲しいっす!(>_<)
% cd /usr/local/src % gzip -dc ./php-4.4.8.tar.gz | gtar xf - % cd ./php-4.4.8 % ./configure \ --disable-all \ --without-mysql \ --enable-mbstring \ --enable-debug % make % cp sapi/cli/php /usr/local/src/php-4.4.8-cli1 % cd /usr/local/src % gzip -dc ./php-5.2.5.tar.gz | gtar xf - % cd ./php-5.2.5 % ./configure \ --disable-all \ --without-iconv \ --enable-mbstring \ --enable-debug % make % cp sapi/cli/php /usr/local/src/php-5.2.5-cli1 % cd /usr/local/src % cat ./replace.php <?php $STR = "price is <%price%>."; $BF = "<%price%>"; echo "----------------------------------------------".PHP_EOL; var_dump( PHP_VERSION ); echo "----------------------------------------------".PHP_EOL; $AF = "\\0"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\1"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\2"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\3"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\4"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\5"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\6"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\7"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo 
"----------------------------------------------".PHP_EOL; $AF = "\\8"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); echo "----------------------------------------------".PHP_EOL; $AF = "\\9"; var_dump( mb_ereg_replace( $BF, $AF, $STR ) ); var_dump( mb_eregi_replace( $BF, $AF, $STR ) ); ?> % ./php-4.4.8-cli1 ./replace.php ---------------------------------------------- string(5) "4.4.8" ---------------------------------------------- string(19) "price is <%price%>." string(19) "price is <%price%>." ---------------------------------------------- string(12) "price is \1." string(12) "price is \1." ---------------------------------------------- string(12) "price is \2." string(12) "price is \2." ---------------------------------------------- string(12) "price is \3." string(12) "price is \3." ---------------------------------------------- string(12) "price is \4." string(12) "price is \4." ---------------------------------------------- string(12) "price is \5." string(12) "price is \5." ---------------------------------------------- string(12) "price is \6." string(12) "price is \6." ---------------------------------------------- string(12) "price is \7." string(12) "price is \7." ---------------------------------------------- string(12) "price is \8." string(12) "price is \8." ---------------------------------------------- string(12) "price is \9." string(12) "price is \9." % ./php-5.2.5-cli1 ./replace.php ---------------------------------------------- string(5) "5.2.5" ---------------------------------------------- string(19) "price is <%price%>." string(19) "price is <%price%>." ---------------------------------------------- string(12) "price is \1." string(12) "price is \1." ---------------------------------------------- string(12) "price is \2." string(12) "price is \2." ---------------------------------------------- string(12) "price is \3." string(12) "price is \3." 
---------------------------------------------- string(12) "price is \4." string(12) "price is \4." ---------------------------------------------- string(12) "price is \5." string(12) "price is \5." ---------------------------------------------- string(12) "price is \6." string(12) "price is \6." ---------------------------------------------- string(12) "price is \7." string(12) "price is \7." ---------------------------------------------- string(12) "price is \8." string(12) "price is \8." ---------------------------------------------- string(12) "price is \9." string(12) "price is \9."
replacementが「\0」の時だけ、リテラル文字列「\0」に置換されず、マッチした文字列全体(<%price%>)に展開されてしまっているし。。。_| ̄|○
実装がどうなっているか確認。
バージョンに関わらず発生しているという事は、この部分の処理は正規表現エンジン中ではなく、PHP内部で制御しているのでは。。。と思っていたら見事にビンゴ。
% cd /usr/local/src % less -N ./php-4.4.8/ext/mbstring/php_mbregex.c 〜 省略 〜 519 /* copy replacement and backrefs */ 520 /* FIXME: this code (\\digit replacement) is not mbyte aware! */ 521 i = 0; 522 p = replace; 523 while (i < replace_len) { 524 n = -1; 525 if (p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { /* ※1 */ 526 n = p[1] - '0'; /* ※1 */ 527 } /* ※1 */ 528 if (n >= 0 && n < regs.num_regs) { /* ※2 */ 529 if (regs.beg[n] >= 0 && regs.beg[n] < regs.end[n] && regs.end[n] <= string_len) { 530 _php_mb_regex_strbuf_ncat(pdevice, (const unsigned char *)&string[regs.beg[n]], regs.end[n] - regs.beg[n]); 531 } 532 p += 2; 533 i += 2; 534 } else { 535 _php_mb_regex_strbuf_ncat(pdevice, (const unsigned char *)p, 1); 536 p++; 537 i++; 538 } 539 } 〜 省略 〜 % less -N ./php-5.2.5/ext/mbstring/php_mbregex.c 〜 省略 〜 713 /* copy replacement and backrefs */ 714 i = 0; 715 p = replace; 716 while (i < replace_len) { 717 int fwd = (int) php_mb_mbchar_bytes_ex(p, enc); 718 n = -1; 719 if ((replace_len - i) >= 2 && fwd == 1 && 720 p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { /* ※1 */ 721 n = p[1] - '0'; /* ※1 */ 722 } /* ※1 */ 723 if (n >= 0 && n < regs->num_regs) { /* ※2 */ 724 if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && regs->end[n] <= string_len) { 725 smart_str_appendl(pbuf, string + regs->beg[n], regs->end[n] - regs->beg[n]); 726 } 727 p += 2; 728 i += 2; 729 } else { 730 smart_str_appendl(pbuf, p, fwd); 731 p += fwd; 732 i += fwd; 733 } 734 } 〜 省略 〜
「\」の次が「0」「1」「2」「3」「4」「5」「6」「7」「8」「9」であった場合は該当処理(※1)が実行される。。。と。
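で、今回のパターンにはグループ(括弧)が無く「regs.num_regs」が「1」になっていたので、条件「n < regs.num_regs」を満たす「\0」(n = 0)の時だけ該当処理(※2)が実行され、マッチ全体に展開される。。。と。「\1」〜「\9」は条件を満たさないので、そのままリテラルとして出力される。_| ̄|○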
危険を承知で該当処理(※2)に入らないよう修正してみる。暫定処置なのでパッチは手抜き。
% cd /usr/local/src % cat ./php-4.4.8-php_mbregex.c.diff --- php-4.4.8,orig/ext/mbstring/php_mbregex.c 2006-01-01 22:46:54.000000000 +0900 +++ php-4.4.8/ext/mbstring/php_mbregex.c 2008-03-31 10:12:08.000000000 +0900 @@ -523,7 +523,6 @@ while (i < replace_len) { n = -1; if (p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { - n = p[1] - '0'; } if (n >= 0 && n < regs.num_regs) { if (regs.beg[n] >= 0 && regs.beg[n] < regs.end[n] && regs.end[n] <= string_len) { % patch -p0 < ./php-4.4.8-php_mbregex.c.diff patching file php-4.4.8/ext/mbstring/php_mbregex.c % cd ./php-4.4.8 % make clean % make % cp sapi/cli/php /usr/local/src/php-4.4.8-cli2 % cd /usr/local/src % cat ./php-5.2.5-php_mbregex.c.diff --- php-5.2.5,orig/ext/mbstring/php_mbregex.c 2007-01-12 07:23:20.000000000 +0900 +++ php-5.2.5/ext/mbstring/php_mbregex.c 2008-03-31 10:12:28.000000000 +0900 @@ -718,7 +718,6 @@ n = -1; if ((replace_len - i) >= 2 && fwd == 1 && p[0] == '\\' && p[1] >= '0' && p[1] <= '9') { - n = p[1] - '0'; } if (n >= 0 && n < regs->num_regs) { if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] && regs->end[n] <= string_len) { % patch -p0 < ./php-5.2.5-php_mbregex.c.diff patching file php-5.2.5/ext/mbstring/php_mbregex.c % cd ./php-5.2.5 % make clean % make % cp sapi/cli/php /usr/local/src/php-5.2.5-cli2 % cd /usr/local/src % ./php-4.4.8-cli2 ./replace.php ---------------------------------------------- string(5) "4.4.8" ---------------------------------------------- string(12) "price is \0." string(12) "price is \0." ---------------------------------------------- string(12) "price is \1." string(12) "price is \1." ---------------------------------------------- string(12) "price is \2." string(12) "price is \2." ---------------------------------------------- string(12) "price is \3." string(12) "price is \3." ---------------------------------------------- string(12) "price is \4." string(12) "price is \4." ---------------------------------------------- string(12) "price is \5." 
string(12) "price is \5." ---------------------------------------------- string(12) "price is \6." string(12) "price is \6." ---------------------------------------------- string(12) "price is \7." string(12) "price is \7." ---------------------------------------------- string(12) "price is \8." string(12) "price is \8." ---------------------------------------------- string(12) "price is \9." string(12) "price is \9." % ./php-5.2.5-cli2 ./replace.php ---------------------------------------------- string(5) "5.2.5" ---------------------------------------------- string(12) "price is \0." string(12) "price is \0." ---------------------------------------------- string(12) "price is \1." string(12) "price is \1." ---------------------------------------------- string(12) "price is \2." string(12) "price is \2." ---------------------------------------------- string(12) "price is \3." string(12) "price is \3." ---------------------------------------------- string(12) "price is \4." string(12) "price is \4." ---------------------------------------------- string(12) "price is \5." string(12) "price is \5." ---------------------------------------------- string(12) "price is \6." string(12) "price is \6." ---------------------------------------------- string(12) "price is \7." string(12) "price is \7." ---------------------------------------------- string(12) "price is \8." string(12) "price is \8." ---------------------------------------------- string(12) "price is \9." string(12) "price is \9."
とりあえず「\0」もリテラル文字列として置換されるようになった。ただし、この修正では「n」が常に「-1」のままになるため、replacement中の後方参照(「\0」〜「\9」)は一切展開されなくなる。それ以外の影響範囲(副作用)は不明。
オリジナルの実装者ではないので詳細は不明なのですが、こういう実装になっているのは何か理由があるのかしら。なんか、ありそう。
正規表現に詳しくないのでアレなんですが、replacementの「\0」って何か特別な意味を持っているのだろうか。
それにしても、そろそろmb_str_replace関数が欲しいなぁ。。。と思う、今日この頃。