1007 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			1007 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|  | /*************************************************
 | ||
|  | *      Perl-Compatible Regular Expressions       * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* PCRE is a library of functions to support regular expressions whose syntax
 | ||
|  | and semantics are as close as possible to those of the Perl 5 language. | ||
|  | 
 | ||
|  |                        Written by Philip Hazel | ||
|  |      Original API code Copyright (c) 1997-2012 University of Cambridge | ||
|  |           New API code Copyright (c) 2016-2021 University of Cambridge | ||
|  | 
 | ||
|  | ----------------------------------------------------------------------------- | ||
|  | Redistribution and use in source and binary forms, with or without | ||
|  | modification, are permitted provided that the following conditions are met: | ||
|  | 
 | ||
|  |     * Redistributions of source code must retain the above copyright notice, | ||
|  |       this list of conditions and the following disclaimer. | ||
|  | 
 | ||
|  |     * Redistributions in binary form must reproduce the above copyright | ||
|  |       notice, this list of conditions and the following disclaimer in the | ||
|  |       documentation and/or other materials provided with the distribution. | ||
|  | 
 | ||
|  |     * Neither the name of the University of Cambridge nor the names of its | ||
|  |       contributors may be used to endorse or promote products derived from | ||
|  |       this software without specific prior written permission. | ||
|  | 
 | ||
|  | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
|  | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
|  | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
|  | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | ||
|  | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
|  | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
|  | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
|  | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
|  | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
|  | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
|  | POSSIBILITY OF SUCH DAMAGE. | ||
|  | ----------------------------------------------------------------------------- | ||
|  | */ | ||
|  | 
 | ||
|  | 
 | ||
|  | #ifdef HAVE_CONFIG_H
 | ||
|  | #include "config.h"
 | ||
|  | #endif
 | ||
|  | 
 | ||
|  | #include "pcre2_internal.h"
 | ||
|  | 
 | ||
|  | #define PTR_STACK_SIZE 20
 | ||
|  | 
 | ||
|  | #define SUBSTITUTE_OPTIONS \
 | ||
|  |   (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ | ||
|  |    PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \ | ||
|  |    PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \ | ||
|  |    PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY) | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *           Find end of substitute text          * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* In extended mode, we recognize ${name:+set text:unset text} and similar
 | ||
|  | constructions. This requires the identification of unescaped : and } | ||
|  | characters. This function scans for such. It must deal with nested ${ | ||
|  | constructions. The pointer to the text is updated, either to the required end | ||
|  | character, or to where an error was detected. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   code      points to the compiled expression (for options) | ||
|  |   ptrptr    points to the pointer to the start of the text (updated) | ||
|  |   ptrend    end of the whole string | ||
|  |   last      TRUE if the last expected string (only } recognized) | ||
|  | 
 | ||
|  | Returns:    0 on success | ||
|  |             negative error code on failure | ||
|  | */ | ||
|  | 
 | ||
|  | static int | ||
|  | find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, | ||
|  |   BOOL last) | ||
|  | { | ||
|  | int rc = 0; | ||
|  | uint32_t nestlevel = 0; | ||
|  | BOOL literal = FALSE; | ||
|  | PCRE2_SPTR ptr = *ptrptr; | ||
|  | 
 | ||
|  | for (; ptr < ptrend; ptr++) | ||
|  |   { | ||
|  |   if (literal) | ||
|  |     { | ||
|  |     if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E) | ||
|  |       { | ||
|  |       literal = FALSE; | ||
|  |       ptr += 1; | ||
|  |       } | ||
|  |     } | ||
|  | 
 | ||
|  |   else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) | ||
|  |     { | ||
|  |     if (nestlevel == 0) goto EXIT; | ||
|  |     nestlevel--; | ||
|  |     } | ||
|  | 
 | ||
|  |   else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT; | ||
|  | 
 | ||
|  |   else if (*ptr == CHAR_DOLLAR_SIGN) | ||
|  |     { | ||
|  |     if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET) | ||
|  |       { | ||
|  |       nestlevel++; | ||
|  |       ptr += 1; | ||
|  |       } | ||
|  |     } | ||
|  | 
 | ||
|  |   else if (*ptr == CHAR_BACKSLASH) | ||
|  |     { | ||
|  |     int erc; | ||
|  |     int errorcode; | ||
|  |     uint32_t ch; | ||
|  | 
 | ||
|  |     if (ptr < ptrend - 1) switch (ptr[1]) | ||
|  |       { | ||
|  |       case CHAR_L: | ||
|  |       case CHAR_l: | ||
|  |       case CHAR_U: | ||
|  |       case CHAR_u: | ||
|  |       ptr += 1; | ||
|  |       continue; | ||
|  |       } | ||
|  | 
 | ||
|  |     ptr += 1;  /* Must point after \ */ | ||
|  |     erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode, | ||
|  |       code->overall_options, code->extra_options, FALSE, NULL); | ||
|  |     ptr -= 1;  /* Back to last code unit of escape */ | ||
|  |     if (errorcode != 0) | ||
|  |       { | ||
|  |       rc = errorcode; | ||
|  |       goto EXIT; | ||
|  |       } | ||
|  | 
 | ||
|  |     switch(erc) | ||
|  |       { | ||
|  |       case 0:      /* Data character */ | ||
|  |       case ESC_E:  /* Isolated \E is ignored */ | ||
|  |       break; | ||
|  | 
 | ||
|  |       case ESC_Q: | ||
|  |       literal = TRUE; | ||
|  |       break; | ||
|  | 
 | ||
|  |       default: | ||
|  |       rc = PCRE2_ERROR_BADREPESCAPE; | ||
|  |       goto EXIT; | ||
|  |       } | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  | rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */ | ||
|  | 
 | ||
|  | EXIT: | ||
|  | *ptrptr = ptr; | ||
|  | return rc; | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | /*************************************************
 | ||
|  | *              Match and substitute              * | ||
|  | *************************************************/ | ||
|  | 
 | ||
|  | /* This function applies a compiled re to a subject string and creates a new
 | ||
|  | string with substitutions. The first 7 arguments are the same as for | ||
|  | pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED. | ||
|  | 
 | ||
|  | Arguments: | ||
|  |   code            points to the compiled expression | ||
|  |   subject         points to the subject string | ||
|  |   length          length of subject string (may contain binary zeros) | ||
|  |   start_offset    where to start in the subject string | ||
|  |   options         option bits | ||
|  |   match_data      points to a match_data block, or is NULL | ||
|  |   mcontext        points to a PCRE2 match context, or is NULL | ||
|  |   replacement     points to the replacement string | ||
|  |   rlength         length of replacement string | ||
|  |   buffer          where to put the substituted string | ||
|  |   blength         points to length of buffer; updated to length of string | ||
|  | 
 | ||
|  | Returns:          >= 0 number of substitutions made | ||
|  |                   < 0 an error code | ||
|  |                   PCRE2_ERROR_BADREPLACEMENT means invalid use of $ | ||
|  | */ | ||
|  | 
 | ||
|  | /* This macro checks for space in the buffer before copying into it. On
 | ||
|  | overflow, either give an error immediately, or keep on, accumulating the | ||
|  | length. */ | ||
|  | 
 | ||
/* CHECKMEMCPY uses these names from the scope in which it is expanded:
buffer, buff_offset, lengthleft, extra_needed, overflowed, suboptions, and the
label NOROOM. Both arguments are parenthesized in the expansion so that an
expression such as "a ? b : c" is handled correctly, but "length" is still
evaluated more than once and so must not have side effects. */

#define CHECKMEMCPY(from,length) \
  { \
  if (!overflowed && lengthleft < (length)) \
    { \
    if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
    overflowed = TRUE; \
    extra_needed = (length) - lengthleft; \
    } \
  else if (overflowed) \
    { \
    extra_needed += (length); \
    }  \
  else \
    {  \
    memcpy(buffer + buff_offset, (from), CU2BYTES((length))); \
    buff_offset += (length); \
    lengthleft -= (length); \
    } \
  }
|  | 
 | ||
|  | /* Here's the function */ | ||
|  | 
 | ||
|  | PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION | ||
|  | pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, | ||
|  |   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, | ||
|  |   pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength, | ||
|  |   PCRE2_UCHAR *buffer, PCRE2_SIZE *blength) | ||
|  | { | ||
|  | int rc; | ||
|  | int subs; | ||
|  | int forcecase = 0; | ||
|  | int forcecasereset = 0; | ||
|  | uint32_t ovector_count; | ||
|  | uint32_t goptions = 0; | ||
|  | uint32_t suboptions; | ||
|  | pcre2_match_data *internal_match_data = NULL; | ||
|  | BOOL escaped_literal = FALSE; | ||
|  | BOOL overflowed = FALSE; | ||
|  | BOOL use_existing_match; | ||
|  | BOOL replacement_only; | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  | BOOL utf = (code->overall_options & PCRE2_UTF) != 0; | ||
|  | BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; | ||
|  | #endif
 | ||
|  | PCRE2_UCHAR temp[6]; | ||
|  | PCRE2_SPTR ptr; | ||
|  | PCRE2_SPTR repend; | ||
|  | PCRE2_SIZE extra_needed = 0; | ||
|  | PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; | ||
|  | PCRE2_SIZE *ovector; | ||
|  | PCRE2_SIZE ovecsave[3]; | ||
|  | pcre2_substitute_callout_block scb; | ||
|  | 
 | ||
|  | /* General initialization */ | ||
|  | 
 | ||
|  | buff_offset = 0; | ||
|  | lengthleft = buff_length = *blength; | ||
|  | *blength = PCRE2_UNSET; | ||
|  | ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; | ||
|  | 
 | ||
|  | /* Partial matching is not valid. This must come after setting *blength to
 | ||
|  | PCRE2_UNSET, so as not to imply an offset in the replacement. */ | ||
|  | 
 | ||
|  | if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) | ||
|  |   return PCRE2_ERROR_BADOPTION; | ||
|  |    | ||
|  | /* Validate length and find the end of the replacement. A NULL replacement of 
 | ||
|  | zero length is interpreted as an empty string. */ | ||
|  | 
 | ||
|  | if (replacement == NULL)  | ||
|  |   { | ||
|  |   if (rlength != 0) return PCRE2_ERROR_NULL; | ||
|  |   replacement = (PCRE2_SPTR)"";  | ||
|  |   }  | ||
|  |     | ||
|  | if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement); | ||
|  | repend = replacement + rlength; | ||
|  | 
 | ||
|  | /* Check for using a match that has already happened. Note that the subject
 | ||
|  | pointer in the match data may be NULL after a no-match. */ | ||
|  | 
 | ||
|  | use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); | ||
|  | replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); | ||
|  | 
 | ||
|  | /* If starting from an existing match, there must be an externally provided
 | ||
|  | match data block. We create an internal match_data block in two cases: (a) an | ||
|  | external one is not supplied (and we are not starting from an existing match); | ||
|  | (b) an existing match is to be used for the first substitution. In the latter | ||
|  | case, we copy the existing match into the internal block. This ensures that no | ||
|  | changes are made to the existing match data block. */ | ||
|  | 
 | ||
|  | if (match_data == NULL) | ||
|  |   { | ||
|  |   pcre2_general_context *gcontext; | ||
|  |   if (use_existing_match) return PCRE2_ERROR_NULL; | ||
|  |   gcontext = (mcontext == NULL)? | ||
|  |     (pcre2_general_context *)code : | ||
|  |     (pcre2_general_context *)mcontext; | ||
|  |   match_data = internal_match_data = | ||
|  |     pcre2_match_data_create_from_pattern(code, gcontext); | ||
|  |   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; | ||
|  |   } | ||
|  | 
 | ||
|  | else if (use_existing_match) | ||
|  |   { | ||
|  |   pcre2_general_context *gcontext = (mcontext == NULL)? | ||
|  |     (pcre2_general_context *)code : | ||
|  |     (pcre2_general_context *)mcontext; | ||
|  |   int pairs = (code->top_bracket + 1 < match_data->oveccount)? | ||
|  |     code->top_bracket + 1 : match_data->oveccount; | ||
|  |   internal_match_data = pcre2_match_data_create(match_data->oveccount, | ||
|  |     gcontext); | ||
|  |   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; | ||
|  |   memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) | ||
|  |     + 2*pairs*sizeof(PCRE2_SIZE)); | ||
|  |   match_data = internal_match_data; | ||
|  |   } | ||
|  | 
 | ||
|  | /* Remember ovector details */ | ||
|  | 
 | ||
|  | ovector = pcre2_get_ovector_pointer(match_data); | ||
|  | ovector_count = pcre2_get_ovector_count(match_data); | ||
|  | 
 | ||
|  | /* Fixed things in the callout block */ | ||
|  | 
 | ||
|  | scb.version = 0; | ||
|  | scb.input = subject; | ||
|  | scb.output = (PCRE2_SPTR)buffer; | ||
|  | scb.ovector = ovector; | ||
|  | 
 | ||
|  | /* A NULL subject of zero length is treated as an empty string. */ | ||
|  | 
 | ||
|  | if (subject == NULL) | ||
|  |   { | ||
|  |   if (length != 0) return PCRE2_ERROR_NULL;  | ||
|  |   subject = (PCRE2_SPTR)""; | ||
|  |   }  | ||
|  | 
 | ||
|  | /* Find length of zero-terminated subject */ | ||
|  | 
 | ||
|  | if (length == PCRE2_ZERO_TERMINATED) | ||
|  |   length = subject? PRIV(strlen)(subject) : 0; | ||
|  | 
 | ||
|  | /* Check UTF replacement string if necessary. */ | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  | if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) | ||
|  |   { | ||
|  |   rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); | ||
|  |   if (rc != 0) | ||
|  |     { | ||
|  |     match_data->leftchar = 0; | ||
|  |     goto EXIT; | ||
|  |     } | ||
|  |   } | ||
|  | #endif  /* SUPPORT_UNICODE */
 | ||
|  | 
 | ||
|  | /* Save the substitute options and remove them from the match options. */ | ||
|  | 
 | ||
|  | suboptions = options & SUBSTITUTE_OPTIONS; | ||
|  | options &= ~SUBSTITUTE_OPTIONS; | ||
|  | 
 | ||
|  | /* Error if the start match offset is greater than the length of the subject. */ | ||
|  | 
 | ||
|  | if (start_offset > length) | ||
|  |   { | ||
|  |   match_data->leftchar = 0; | ||
|  |   rc = PCRE2_ERROR_BADOFFSET; | ||
|  |   goto EXIT; | ||
|  |   } | ||
|  | 
 | ||
|  | /* Copy up to the start offset, unless only the replacement is required. */ | ||
|  | 
 | ||
|  | if (!replacement_only) CHECKMEMCPY(subject, start_offset); | ||
|  | 
 | ||
|  | /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
 | ||
|  | match is taken from the match_data that was passed in. */ | ||
|  | 
 | ||
|  | subs = 0; | ||
|  | do | ||
|  |   { | ||
|  |   PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; | ||
|  |   uint32_t ptrstackptr = 0; | ||
|  | 
 | ||
|  |   if (use_existing_match) | ||
|  |     { | ||
|  |     rc = match_data->rc; | ||
|  |     use_existing_match = FALSE; | ||
|  |     } | ||
|  |   else rc = pcre2_match(code, subject, length, start_offset, options|goptions, | ||
|  |     match_data, mcontext); | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |   if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */ | ||
|  | #endif
 | ||
|  | 
 | ||
|  |   /* Any error other than no match returns the error code. No match when not
 | ||
|  |   doing the special after-empty-match global rematch, or when at the end of the | ||
|  |   subject, breaks the global loop. Otherwise, advance the starting point by one | ||
|  |   character, copying it to the output, and try again. */ | ||
|  | 
 | ||
|  |   if (rc < 0) | ||
|  |     { | ||
|  |     PCRE2_SIZE save_start; | ||
|  | 
 | ||
|  |     if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; | ||
|  |     if (goptions == 0 || start_offset >= length) break; | ||
|  | 
 | ||
|  |     /* Advance by one code point. Then, if CRLF is a valid newline sequence and
 | ||
|  |     we have advanced into the middle of it, advance one more code point. In | ||
|  |     other words, do not start in the middle of CRLF, even if CR and LF on their | ||
|  |     own are valid newlines. */ | ||
|  | 
 | ||
|  |     save_start = start_offset++; | ||
|  |     if (subject[start_offset-1] == CHAR_CR && | ||
|  |         code->newline_convention != PCRE2_NEWLINE_CR && | ||
|  |         code->newline_convention != PCRE2_NEWLINE_LF && | ||
|  |         start_offset < length && | ||
|  |         subject[start_offset] == CHAR_LF) | ||
|  |       start_offset++; | ||
|  | 
 | ||
|  |     /* Otherwise, in UTF mode, advance past any secondary code points. */ | ||
|  | 
 | ||
|  |     else if ((code->overall_options & PCRE2_UTF) != 0) | ||
|  |       { | ||
|  | #if PCRE2_CODE_UNIT_WIDTH == 8
 | ||
|  |       while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) | ||
|  |         start_offset++; | ||
|  | #elif PCRE2_CODE_UNIT_WIDTH == 16
 | ||
|  |       while (start_offset < length && | ||
|  |             (subject[start_offset] & 0xfc00) == 0xdc00) | ||
|  |         start_offset++; | ||
|  | #endif
 | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Copy what we have advanced past (unless not required), reset the special
 | ||
|  |     global options, and continue to the next match. */ | ||
|  | 
 | ||
|  |     fraglength = start_offset - save_start; | ||
|  |     if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); | ||
|  |     goptions = 0; | ||
|  |     continue; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Handle a successful match. Matches that use \K to end before they start
 | ||
|  |   or start before the current point in the subject are not supported. */ | ||
|  | 
 | ||
|  |   if (ovector[1] < ovector[0] || ovector[0] < start_offset) | ||
|  |     { | ||
|  |     rc = PCRE2_ERROR_BADSUBSPATTERN; | ||
|  |     goto EXIT; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Check for the same match as previous. This is legitimate after matching an
 | ||
|  |   empty string that starts after the initial match offset. We have tried again | ||
|  |   at the match point in case the pattern is one like /(?<=\G.)/ which can never | ||
|  |   match at its starting point, so running the match achieves the bumpalong. If | ||
|  |   we do get the same (null) match at the original match point, it isn't such a | ||
|  |   pattern, so we now do the empty string magic. In all other cases, a repeat | ||
|  |   match should never occur. */ | ||
|  | 
 | ||
|  |   if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) | ||
|  |     { | ||
|  |     if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) | ||
|  |       { | ||
|  |       goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; | ||
|  |       ovecsave[2] = start_offset; | ||
|  |       continue;    /* Back to the top of the loop */ | ||
|  |       } | ||
|  |     rc = PCRE2_ERROR_INTERNAL_DUPMATCH; | ||
|  |     goto EXIT; | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Count substitutions with a paranoid check for integer overflow; surely no
 | ||
|  |   real call to this function would ever hit this! */ | ||
|  | 
 | ||
|  |   if (subs == INT_MAX) | ||
|  |     { | ||
|  |     rc = PCRE2_ERROR_TOOMANYREPLACE; | ||
|  |     goto EXIT; | ||
|  |     } | ||
|  |   subs++; | ||
|  | 
 | ||
|  |   /* Copy the text leading up to the match (unless not required), and remember
 | ||
|  |   where the insert begins and how many ovector pairs are set. */ | ||
|  | 
 | ||
|  |   if (rc == 0) rc = ovector_count; | ||
|  |   fraglength = ovector[0] - start_offset; | ||
|  |   if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); | ||
|  |   scb.output_offsets[0] = buff_offset; | ||
|  |   scb.oveccount = rc; | ||
|  | 
 | ||
|  |   /* Process the replacement string. If the entire replacement is literal, just
 | ||
|  |   copy it with length check. */ | ||
|  | 
 | ||
|  |   ptr = replacement; | ||
|  |   if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) | ||
|  |     { | ||
|  |     CHECKMEMCPY(ptr, rlength); | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Within a non-literal replacement, which must be scanned character by
 | ||
|  |   character, local literal mode can be set by \Q, but only in extended mode | ||
|  |   when backslashes are being interpreted. In extended mode we must handle | ||
|  |   nested substrings that are to be reprocessed. */ | ||
|  | 
 | ||
|  |   else for (;;) | ||
|  |     { | ||
|  |     uint32_t ch; | ||
|  |     unsigned int chlen; | ||
|  | 
 | ||
|  |     /* If at the end of a nested substring, pop the stack. */ | ||
|  | 
 | ||
|  |     if (ptr >= repend) | ||
|  |       { | ||
|  |       if (ptrstackptr == 0) break;       /* End of replacement string */ | ||
|  |       repend = ptrstack[--ptrstackptr]; | ||
|  |       ptr = ptrstack[--ptrstackptr]; | ||
|  |       continue; | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Handle the next character */ | ||
|  | 
 | ||
|  |     if (escaped_literal) | ||
|  |       { | ||
|  |       if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) | ||
|  |         { | ||
|  |         escaped_literal = FALSE; | ||
|  |         ptr += 2; | ||
|  |         continue; | ||
|  |         } | ||
|  |       goto LOADLITERAL; | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Not in literal mode. */ | ||
|  | 
 | ||
|  |     if (*ptr == CHAR_DOLLAR_SIGN) | ||
|  |       { | ||
|  |       int group, n; | ||
|  |       uint32_t special = 0; | ||
|  |       BOOL inparens; | ||
|  |       BOOL star; | ||
|  |       PCRE2_SIZE sublength; | ||
|  |       PCRE2_SPTR text1_start = NULL; | ||
|  |       PCRE2_SPTR text1_end = NULL; | ||
|  |       PCRE2_SPTR text2_start = NULL; | ||
|  |       PCRE2_SPTR text2_end = NULL; | ||
|  |       PCRE2_UCHAR next; | ||
|  |       PCRE2_UCHAR name[33]; | ||
|  | 
 | ||
|  |       if (++ptr >= repend) goto BAD; | ||
|  |       if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL; | ||
|  | 
 | ||
|  |       group = -1; | ||
|  |       n = 0; | ||
|  |       inparens = FALSE; | ||
|  |       star = FALSE; | ||
|  | 
 | ||
|  |       if (next == CHAR_LEFT_CURLY_BRACKET) | ||
|  |         { | ||
|  |         if (++ptr >= repend) goto BAD; | ||
|  |         next = *ptr; | ||
|  |         inparens = TRUE; | ||
|  |         } | ||
|  | 
 | ||
|  |       if (next == CHAR_ASTERISK) | ||
|  |         { | ||
|  |         if (++ptr >= repend) goto BAD; | ||
|  |         next = *ptr; | ||
|  |         star = TRUE; | ||
|  |         } | ||
|  | 
 | ||
|  |       if (!star && next >= CHAR_0 && next <= CHAR_9) | ||
|  |         { | ||
|  |         group = next - CHAR_0; | ||
|  |         while (++ptr < repend) | ||
|  |           { | ||
|  |           next = *ptr; | ||
|  |           if (next < CHAR_0 || next > CHAR_9) break; | ||
|  |           group = group * 10 + next - CHAR_0; | ||
|  | 
 | ||
|  |           /* A check for a number greater than the highest captured group
 | ||
|  |           is sufficient here; no need for a separate overflow check. If unknown | ||
|  |           groups are to be treated as unset, just skip over any remaining | ||
|  |           digits and carry on. */ | ||
|  | 
 | ||
|  |           if (group > code->top_bracket) | ||
|  |             { | ||
|  |             if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) | ||
|  |               { | ||
|  |               while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9); | ||
|  |               break; | ||
|  |               } | ||
|  |             else | ||
|  |               { | ||
|  |               rc = PCRE2_ERROR_NOSUBSTRING; | ||
|  |               goto PTREXIT; | ||
|  |               } | ||
|  |             } | ||
|  |           } | ||
|  |         } | ||
|  |       else | ||
|  |         { | ||
|  |         const uint8_t *ctypes = code->tables + ctypes_offset; | ||
|  |         while (MAX_255(next) && (ctypes[next] & ctype_word) != 0) | ||
|  |           { | ||
|  |           name[n++] = next; | ||
|  |           if (n > 32) goto BAD; | ||
|  |           if (++ptr >= repend) break; | ||
|  |           next = *ptr; | ||
|  |           } | ||
|  |         if (n == 0) goto BAD; | ||
|  |         name[n] = 0; | ||
|  |         } | ||
|  | 
 | ||
|  |       /* In extended mode we recognize ${name:+set text:unset text} and
 | ||
|  |       ${name:-default text}. */ | ||
|  | 
 | ||
|  |       if (inparens) | ||
|  |         { | ||
|  |         if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && | ||
|  |              !star && ptr < repend - 2 && next == CHAR_COLON) | ||
|  |           { | ||
|  |           special = *(++ptr); | ||
|  |           if (special != CHAR_PLUS && special != CHAR_MINUS) | ||
|  |             { | ||
|  |             rc = PCRE2_ERROR_BADSUBSTITUTION; | ||
|  |             goto PTREXIT; | ||
|  |             } | ||
|  | 
 | ||
|  |           text1_start = ++ptr; | ||
|  |           rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS); | ||
|  |           if (rc != 0) goto PTREXIT; | ||
|  |           text1_end = ptr; | ||
|  | 
 | ||
|  |           if (special == CHAR_PLUS && *ptr == CHAR_COLON) | ||
|  |             { | ||
|  |             text2_start = ++ptr; | ||
|  |             rc = find_text_end(code, &ptr, repend, TRUE); | ||
|  |             if (rc != 0) goto PTREXIT; | ||
|  |             text2_end = ptr; | ||
|  |             } | ||
|  |           } | ||
|  | 
 | ||
|  |         else | ||
|  |           { | ||
|  |           if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET) | ||
|  |             { | ||
|  |             rc = PCRE2_ERROR_REPMISSINGBRACE; | ||
|  |             goto PTREXIT; | ||
|  |             } | ||
|  |           } | ||
|  | 
 | ||
|  |         ptr++; | ||
|  |         } | ||
|  | 
 | ||
|  |       /* Have found a syntactically correct group number or name, or *name.
 | ||
|  |       Only *MARK is currently recognized. */ | ||
|  | 
 | ||
|  |       if (star) | ||
|  |         { | ||
|  |         if (PRIV(strcmp_c8)(name, STRING_MARK) == 0) | ||
|  |           { | ||
|  |           PCRE2_SPTR mark = pcre2_get_mark(match_data); | ||
|  |           if (mark != NULL) | ||
|  |             { | ||
|  |             PCRE2_SPTR mark_start = mark; | ||
|  |             while (*mark != 0) mark++; | ||
|  |             fraglength = mark - mark_start; | ||
|  |             CHECKMEMCPY(mark_start, fraglength); | ||
|  |             } | ||
|  |           } | ||
|  |         else goto BAD; | ||
|  |         } | ||
|  | 
 | ||
|  |       /* Substitute the contents of a group. We don't use substring_copy
 | ||
|  |       functions any more, in order to support case forcing. */ | ||
|  | 
 | ||
|  |       else | ||
|  |         { | ||
|  |         PCRE2_SPTR subptr, subptrend; | ||
|  | 
 | ||
|  |         /* Find a number for a named group. In case there are duplicate names,
 | ||
|  |         search for the first one that is set. If the name is not found when | ||
|  |         PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set, set the group number to a | ||
|  |         non-existent group. */ | ||
|  | 
 | ||
|  |         if (group < 0) | ||
|  |           { | ||
|  |           PCRE2_SPTR first, last, entry; | ||
|  |           rc = pcre2_substring_nametable_scan(code, name, &first, &last); | ||
|  |           if (rc == PCRE2_ERROR_NOSUBSTRING && | ||
|  |               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) | ||
|  |             { | ||
|  |             group = code->top_bracket + 1; | ||
|  |             } | ||
|  |           else | ||
|  |             { | ||
|  |             if (rc < 0) goto PTREXIT; | ||
|  |             for (entry = first; entry <= last; entry += rc) | ||
|  |               { | ||
|  |               uint32_t ng = GET2(entry, 0); | ||
|  |               if (ng < ovector_count) | ||
|  |                 { | ||
|  |                 if (group < 0) group = ng;          /* First in ovector */ | ||
|  |                 if (ovector[ng*2] != PCRE2_UNSET) | ||
|  |                   { | ||
|  |                   group = ng;                       /* First that is set */ | ||
|  |                   break; | ||
|  |                   } | ||
|  |                 } | ||
|  |               } | ||
|  | 
 | ||
|  |             /* If group is still negative, it means we did not find a group
 | ||
|  |             that is in the ovector. Just set the first group. */ | ||
|  | 
 | ||
|  |             if (group < 0) group = GET2(first, 0); | ||
|  |             } | ||
|  |           } | ||
|  | 
 | ||
|  |         /* We now have a group that is identified by number. Find the length of
 | ||
|  |         the captured string. If a group in a non-special substitution is unset | ||
|  |         when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */ | ||
|  | 
 | ||
|  |         rc = pcre2_substring_length_bynumber(match_data, group, &sublength); | ||
|  |         if (rc < 0) | ||
|  |           { | ||
|  |           if (rc == PCRE2_ERROR_NOSUBSTRING && | ||
|  |               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0) | ||
|  |             { | ||
|  |             rc = PCRE2_ERROR_UNSET; | ||
|  |             } | ||
|  |           if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */ | ||
|  |           if (special == 0)                           /* Plain substitution */ | ||
|  |             { | ||
|  |             if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue; | ||
|  |             goto PTREXIT;                             /* Else error */ | ||
|  |             } | ||
|  |           } | ||
|  | 
 | ||
|  |         /* If special is '+' we have a 'set' and possibly an 'unset' text,
 | ||
|  |         both of which are reprocessed when used. If special is '-' we have a | ||
|  |         default text for when the group is unset; it must be reprocessed. */ | ||
|  | 
 | ||
|  |         if (special != 0) | ||
|  |           { | ||
|  |           if (special == CHAR_MINUS) | ||
|  |             { | ||
|  |             if (rc == 0) goto LITERAL_SUBSTITUTE; | ||
|  |             text2_start = text1_start; | ||
|  |             text2_end = text1_end; | ||
|  |             } | ||
|  | 
 | ||
|  |           if (ptrstackptr >= PTR_STACK_SIZE) goto BAD; | ||
|  |           ptrstack[ptrstackptr++] = ptr; | ||
|  |           ptrstack[ptrstackptr++] = repend; | ||
|  | 
 | ||
|  |           if (rc == 0) | ||
|  |             { | ||
|  |             ptr = text1_start; | ||
|  |             repend = text1_end; | ||
|  |             } | ||
|  |           else | ||
|  |             { | ||
|  |             ptr = text2_start; | ||
|  |             repend = text2_end; | ||
|  |             } | ||
|  |           continue; | ||
|  |           } | ||
|  | 
 | ||
|  |         /* Otherwise we have a literal substitution of a group's contents. */ | ||
|  | 
 | ||
|  |         LITERAL_SUBSTITUTE: | ||
|  |         subptr = subject + ovector[group*2]; | ||
|  |         subptrend = subject + ovector[group*2 + 1]; | ||
|  | 
 | ||
|  |         /* Substitute a literal string, possibly forcing alphabetic case. */ | ||
|  | 
 | ||
|  |         while (subptr < subptrend) | ||
|  |           { | ||
|  |           GETCHARINCTEST(ch, subptr); | ||
|  |           if (forcecase != 0) | ||
|  |             { | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |             if (utf || ucp) | ||
|  |               { | ||
|  |               uint32_t type = UCD_CHARTYPE(ch); | ||
|  |               if (PRIV(ucp_gentype)[type] == ucp_L && | ||
|  |                   type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) | ||
|  |                 ch = UCD_OTHERCASE(ch); | ||
|  |               } | ||
|  |             else | ||
|  | #endif
 | ||
|  |               { | ||
|  |               if (((code->tables + cbits_offset + | ||
|  |                   ((forcecase > 0)? cbit_upper:cbit_lower) | ||
|  |                   )[ch/8] & (1u << (ch%8))) == 0) | ||
|  |                 ch = (code->tables + fcc_offset)[ch]; | ||
|  |               } | ||
|  |             forcecase = forcecasereset; | ||
|  |             } | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |           if (utf) chlen = PRIV(ord2utf)(ch, temp); else | ||
|  | #endif
 | ||
|  |             { | ||
|  |             temp[0] = ch; | ||
|  |             chlen = 1; | ||
|  |             } | ||
|  |           CHECKMEMCPY(temp, chlen); | ||
|  |           } | ||
|  |         } | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Handle an escape sequence in extended mode. We can use check_escape()
 | ||
|  |     to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but | ||
|  |     the case-forcing escapes are not supported in pcre2_compile() so must be | ||
|  |     recognized here. */ | ||
|  | 
 | ||
|  |     else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 && | ||
|  |               *ptr == CHAR_BACKSLASH) | ||
|  |       { | ||
|  |       int errorcode; | ||
|  | 
 | ||
|  |       if (ptr < repend - 1) switch (ptr[1]) | ||
|  |         { | ||
|  |         case CHAR_L: | ||
|  |         forcecase = forcecasereset = -1; | ||
|  |         ptr += 2; | ||
|  |         continue; | ||
|  | 
 | ||
|  |         case CHAR_l: | ||
|  |         forcecase = -1; | ||
|  |         forcecasereset = 0; | ||
|  |         ptr += 2; | ||
|  |         continue; | ||
|  | 
 | ||
|  |         case CHAR_U: | ||
|  |         forcecase = forcecasereset = 1; | ||
|  |         ptr += 2; | ||
|  |         continue; | ||
|  | 
 | ||
|  |         case CHAR_u: | ||
|  |         forcecase = 1; | ||
|  |         forcecasereset = 0; | ||
|  |         ptr += 2; | ||
|  |         continue; | ||
|  | 
 | ||
|  |         default: | ||
|  |         break; | ||
|  |         } | ||
|  | 
 | ||
|  |       ptr++;  /* Point after \ */ | ||
|  |       rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode, | ||
|  |         code->overall_options, code->extra_options, FALSE, NULL); | ||
|  |       if (errorcode != 0) goto BADESCAPE; | ||
|  | 
 | ||
|  |       switch(rc) | ||
|  |         { | ||
|  |         case ESC_E: | ||
|  |         forcecase = forcecasereset = 0; | ||
|  |         continue; | ||
|  | 
 | ||
|  |         case ESC_Q: | ||
|  |         escaped_literal = TRUE; | ||
|  |         continue; | ||
|  | 
 | ||
|  |         case 0:      /* Data character */ | ||
|  |         goto LITERAL; | ||
|  | 
 | ||
|  |         default: | ||
|  |         goto BADESCAPE; | ||
|  |         } | ||
|  |       } | ||
|  | 
 | ||
|  |     /* Handle a literal code unit */ | ||
|  | 
 | ||
|  |     else | ||
|  |       { | ||
|  |       LOADLITERAL: | ||
|  |       GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */ | ||
|  | 
 | ||
|  |       LITERAL: | ||
|  |       if (forcecase != 0) | ||
|  |         { | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |         if (utf || ucp) | ||
|  |           { | ||
|  |           uint32_t type = UCD_CHARTYPE(ch); | ||
|  |           if (PRIV(ucp_gentype)[type] == ucp_L && | ||
|  |               type != ((forcecase > 0)? ucp_Lu : ucp_Ll)) | ||
|  |             ch = UCD_OTHERCASE(ch); | ||
|  |           } | ||
|  |         else | ||
|  | #endif
 | ||
|  |           { | ||
|  |           if (((code->tables + cbits_offset + | ||
|  |               ((forcecase > 0)? cbit_upper:cbit_lower) | ||
|  |               )[ch/8] & (1u << (ch%8))) == 0) | ||
|  |             ch = (code->tables + fcc_offset)[ch]; | ||
|  |           } | ||
|  |         forcecase = forcecasereset; | ||
|  |         } | ||
|  | 
 | ||
|  | #ifdef SUPPORT_UNICODE
 | ||
|  |       if (utf) chlen = PRIV(ord2utf)(ch, temp); else | ||
|  | #endif
 | ||
|  |         { | ||
|  |         temp[0] = ch; | ||
|  |         chlen = 1; | ||
|  |         } | ||
|  |       CHECKMEMCPY(temp, chlen); | ||
|  |       } /* End handling a literal code unit */ | ||
|  |     }   /* End of loop for scanning the replacement. */ | ||
|  | 
 | ||
|  |   /* The replacement has been copied to the output, or its size has been
 | ||
|  |   remembered. Do the callout if there is one and we have done an actual | ||
|  |   replacement. */ | ||
|  | 
 | ||
|  |   if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) | ||
|  |     { | ||
|  |     scb.subscount = subs; | ||
|  |     scb.output_offsets[1] = buff_offset; | ||
|  |     rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); | ||
|  | 
 | ||
|  |     /* A non-zero return means cancel this substitution. Instead, copy the
 | ||
|  |     matched string fragment. */ | ||
|  | 
 | ||
|  |     if (rc != 0) | ||
|  |       { | ||
|  |       PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; | ||
|  |       PCRE2_SIZE oldlength = ovector[1] - ovector[0]; | ||
|  | 
 | ||
|  |       buff_offset -= newlength; | ||
|  |       lengthleft += newlength; | ||
|  |       if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); | ||
|  | 
 | ||
|  |       /* A negative return means do not do any more. */ | ||
|  | 
 | ||
|  |       if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); | ||
|  |       } | ||
|  |     } | ||
|  | 
 | ||
|  |   /* Save the details of this match. See above for how this data is used. If we
 | ||
|  |   matched an empty string, do the magic for global matches. Update the start | ||
|  |   offset to point to the rest of the subject string. If we re-used an existing | ||
|  |   match for the first match, switch to the internal match data block. */ | ||
|  | 
 | ||
|  |   ovecsave[0] = ovector[0]; | ||
|  |   ovecsave[1] = ovector[1]; | ||
|  |   ovecsave[2] = start_offset; | ||
|  | 
 | ||
|  |   goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : | ||
|  |     PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; | ||
|  |   start_offset = ovector[1]; | ||
|  |   } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */ | ||
|  | 
 | ||
|  | /* Copy the rest of the subject unless not required, and terminate the output
 | ||
|  | with a binary zero. */ | ||
|  | 
 | ||
|  | if (!replacement_only) | ||
|  |   { | ||
|  |   fraglength = length - start_offset; | ||
|  |   CHECKMEMCPY(subject + start_offset, fraglength); | ||
|  |   } | ||
|  | 
 | ||
|  | temp[0] = 0; | ||
|  | CHECKMEMCPY(temp, 1); | ||
|  | 
 | ||
|  | /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set,
 | ||
|  | and matching has carried on after a full buffer, in order to compute the length | ||
|  | needed. Otherwise, an overflow generates an immediate error return. */ | ||
|  | 
 | ||
|  | if (overflowed) | ||
|  |   { | ||
|  |   rc = PCRE2_ERROR_NOMEMORY; | ||
|  |   *blength = buff_length + extra_needed; | ||
|  |   } | ||
|  | 
 | ||
|  | /* After a successful execution, return the number of substitutions and set the
 | ||
|  | length of buffer used, excluding the trailing zero. */ | ||
|  | 
 | ||
|  | else | ||
|  |   { | ||
|  |   rc = subs; | ||
|  |   *blength = buff_offset - 1; | ||
|  |   } | ||
|  | 
 | ||
|  | EXIT: | ||
|  | if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); | ||
|  |   else match_data->rc = rc; | ||
|  | return rc; | ||
|  | 
 | ||
|  | NOROOM: | ||
|  | rc = PCRE2_ERROR_NOMEMORY; | ||
|  | goto EXIT; | ||
|  | 
 | ||
|  | BAD: | ||
|  | rc = PCRE2_ERROR_BADREPLACEMENT; | ||
|  | goto PTREXIT; | ||
|  | 
 | ||
|  | BADESCAPE: | ||
|  | rc = PCRE2_ERROR_BADREPESCAPE; | ||
|  | 
 | ||
|  | PTREXIT: | ||
|  | *blength = (PCRE2_SIZE)(ptr - replacement); | ||
|  | goto EXIT; | ||
|  | } | ||
|  | 
 | ||
|  | /* End of pcre2_substitute.c */ |