1007 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1007 lines
		
	
	
		
			30 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*************************************************
 | |
| *      Perl-Compatible Regular Expressions       *
 | |
| *************************************************/
 | |
| 
 | |
| /* PCRE is a library of functions to support regular expressions whose syntax
 | |
| and semantics are as close as possible to those of the Perl 5 language.
 | |
| 
 | |
|                        Written by Philip Hazel
 | |
|      Original API code Copyright (c) 1997-2012 University of Cambridge
 | |
|           New API code Copyright (c) 2016-2021 University of Cambridge
 | |
| 
 | |
| -----------------------------------------------------------------------------
 | |
| Redistribution and use in source and binary forms, with or without
 | |
| modification, are permitted provided that the following conditions are met:
 | |
| 
 | |
|     * Redistributions of source code must retain the above copyright notice,
 | |
|       this list of conditions and the following disclaimer.
 | |
| 
 | |
|     * Redistributions in binary form must reproduce the above copyright
 | |
|       notice, this list of conditions and the following disclaimer in the
 | |
|       documentation and/or other materials provided with the distribution.
 | |
| 
 | |
|     * Neither the name of the University of Cambridge nor the names of its
 | |
|       contributors may be used to endorse or promote products derived from
 | |
|       this software without specific prior written permission.
 | |
| 
 | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 | |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 | |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 | |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 | |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 | |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 | |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 | |
| POSSIBILITY OF SUCH DAMAGE.
 | |
| -----------------------------------------------------------------------------
 | |
| */
 | |
| 
 | |
| 
 | |
| #ifdef HAVE_CONFIG_H
 | |
| #include "config.h"
 | |
| #endif
 | |
| 
 | |
| #include "pcre2_internal.h"
 | |
| 
 | |
| #define PTR_STACK_SIZE 20
 | |
| 
 | |
| #define SUBSTITUTE_OPTIONS \
 | |
|   (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \
 | |
|    PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \
 | |
|    PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \
 | |
|    PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY)
 | |
| 
 | |
| 
 | |
| 
 | |
| /*************************************************
 | |
| *           Find end of substitute text          *
 | |
| *************************************************/
 | |
| 
 | |
| /* In extended mode, we recognize ${name:+set text:unset text} and similar
 | |
| constructions. This requires the identification of unescaped : and }
 | |
| characters. This function scans for such. It must deal with nested ${
 | |
| constructions. The pointer to the text is updated, either to the required end
 | |
| character, or to where an error was detected.
 | |
| 
 | |
| Arguments:
 | |
|   code      points to the compiled expression (for options)
 | |
|   ptrptr    points to the pointer to the start of the text (updated)
 | |
|   ptrend    end of the whole string
 | |
|   last      TRUE if the last expected string (only } recognized)
 | |
| 
 | |
| Returns:    0 on success
 | |
|             negative error code on failure
 | |
| */
 | |
| 
 | |
| static int
 | |
| find_text_end(const pcre2_code *code, PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
 | |
|   BOOL last)
 | |
| {
 | |
| int rc = 0;
 | |
| uint32_t nestlevel = 0;
 | |
| BOOL literal = FALSE;
 | |
| PCRE2_SPTR ptr = *ptrptr;
 | |
| 
 | |
| for (; ptr < ptrend; ptr++)
 | |
|   {
 | |
|   if (literal)
 | |
|     {
 | |
|     if (ptr[0] == CHAR_BACKSLASH && ptr < ptrend - 1 && ptr[1] == CHAR_E)
 | |
|       {
 | |
|       literal = FALSE;
 | |
|       ptr += 1;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|   else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
 | |
|     {
 | |
|     if (nestlevel == 0) goto EXIT;
 | |
|     nestlevel--;
 | |
|     }
 | |
| 
 | |
|   else if (*ptr == CHAR_COLON && !last && nestlevel == 0) goto EXIT;
 | |
| 
 | |
|   else if (*ptr == CHAR_DOLLAR_SIGN)
 | |
|     {
 | |
|     if (ptr < ptrend - 1 && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 | |
|       {
 | |
|       nestlevel++;
 | |
|       ptr += 1;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|   else if (*ptr == CHAR_BACKSLASH)
 | |
|     {
 | |
|     int erc;
 | |
|     int errorcode;
 | |
|     uint32_t ch;
 | |
| 
 | |
|     if (ptr < ptrend - 1) switch (ptr[1])
 | |
|       {
 | |
|       case CHAR_L:
 | |
|       case CHAR_l:
 | |
|       case CHAR_U:
 | |
|       case CHAR_u:
 | |
|       ptr += 1;
 | |
|       continue;
 | |
|       }
 | |
| 
 | |
|     ptr += 1;  /* Must point after \ */
 | |
|     erc = PRIV(check_escape)(&ptr, ptrend, &ch, &errorcode,
 | |
|       code->overall_options, code->extra_options, FALSE, NULL);
 | |
|     ptr -= 1;  /* Back to last code unit of escape */
 | |
|     if (errorcode != 0)
 | |
|       {
 | |
|       rc = errorcode;
 | |
|       goto EXIT;
 | |
|       }
 | |
| 
 | |
|     switch(erc)
 | |
|       {
 | |
|       case 0:      /* Data character */
 | |
|       case ESC_E:  /* Isolated \E is ignored */
 | |
|       break;
 | |
| 
 | |
|       case ESC_Q:
 | |
|       literal = TRUE;
 | |
|       break;
 | |
| 
 | |
|       default:
 | |
|       rc = PCRE2_ERROR_BADREPESCAPE;
 | |
|       goto EXIT;
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
| rc = PCRE2_ERROR_REPMISSINGBRACE;   /* Terminator not found */
 | |
| 
 | |
| EXIT:
 | |
| *ptrptr = ptr;
 | |
| return rc;
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| /*************************************************
 | |
| *              Match and substitute              *
 | |
| *************************************************/
 | |
| 
 | |
| /* This function applies a compiled re to a subject string and creates a new
 | |
| string with substitutions. The first 7 arguments are the same as for
 | |
| pcre2_match(). Either string length may be PCRE2_ZERO_TERMINATED.
 | |
| 
 | |
| Arguments:
 | |
|   code            points to the compiled expression
 | |
|   subject         points to the subject string
 | |
|   length          length of subject string (may contain binary zeros)
 | |
|   start_offset    where to start in the subject string
 | |
|   options         option bits
 | |
|   match_data      points to a match_data block, or is NULL
 | |
|   context         points a PCRE2 context
 | |
|   replacement     points to the replacement string
 | |
|   rlength         length of replacement string
 | |
|   buffer          where to put the substituted string
 | |
|   blength         points to length of buffer; updated to length of string
 | |
| 
 | |
| Returns:          >= 0 number of substitutions made
 | |
|                   < 0 an error code
 | |
|                   PCRE2_ERROR_BADREPLACEMENT means invalid use of $
 | |
| */
 | |
| 
 | |
| /* This macro checks for space in the buffer before copying into it. On
 | |
| overflow, either give an error immediately, or keep on, accumulating the
 | |
| length. */
 | |
| 
 | |
| #define CHECKMEMCPY(from,length) \
 | |
|   { \
 | |
|   if (!overflowed && lengthleft < length) \
 | |
|     { \
 | |
|     if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \
 | |
|     overflowed = TRUE; \
 | |
|     extra_needed = length - lengthleft; \
 | |
|     } \
 | |
|   else if (overflowed) \
 | |
|     { \
 | |
|     extra_needed += length; \
 | |
|     }  \
 | |
|   else \
 | |
|     {  \
 | |
|     memcpy(buffer + buff_offset, from, CU2BYTES(length)); \
 | |
|     buff_offset += length; \
 | |
|     lengthleft -= length; \
 | |
|     } \
 | |
|   }
 | |
| 
 | |
| /* Here's the function */
 | |
| 
 | |
| PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
 | |
| pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
 | |
|   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
 | |
|   pcre2_match_context *mcontext, PCRE2_SPTR replacement, PCRE2_SIZE rlength,
 | |
|   PCRE2_UCHAR *buffer, PCRE2_SIZE *blength)
 | |
| {
 | |
| int rc;
 | |
| int subs;
 | |
| int forcecase = 0;
 | |
| int forcecasereset = 0;
 | |
| uint32_t ovector_count;
 | |
| uint32_t goptions = 0;
 | |
| uint32_t suboptions;
 | |
| pcre2_match_data *internal_match_data = NULL;
 | |
| BOOL escaped_literal = FALSE;
 | |
| BOOL overflowed = FALSE;
 | |
| BOOL use_existing_match;
 | |
| BOOL replacement_only;
 | |
| #ifdef SUPPORT_UNICODE
 | |
| BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
 | |
| BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
 | |
| #endif
 | |
| PCRE2_UCHAR temp[6];
 | |
| PCRE2_SPTR ptr;
 | |
| PCRE2_SPTR repend;
 | |
| PCRE2_SIZE extra_needed = 0;
 | |
| PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
 | |
| PCRE2_SIZE *ovector;
 | |
| PCRE2_SIZE ovecsave[3];
 | |
| pcre2_substitute_callout_block scb;
 | |
| 
 | |
| /* General initialization */
 | |
| 
 | |
| buff_offset = 0;
 | |
| lengthleft = buff_length = *blength;
 | |
| *blength = PCRE2_UNSET;
 | |
| ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
 | |
| 
 | |
| /* Partial matching is not valid. This must come after setting *blength to
 | |
| PCRE2_UNSET, so as not to imply an offset in the replacement. */
 | |
| 
 | |
| if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0)
 | |
|   return PCRE2_ERROR_BADOPTION;
 | |
|   
 | |
| /* Validate length and find the end of the replacement. A NULL replacement of 
 | |
| zero length is interpreted as an empty string. */
 | |
| 
 | |
| if (replacement == NULL) 
 | |
|   {
 | |
|   if (rlength != 0) return PCRE2_ERROR_NULL;
 | |
|   replacement = (PCRE2_SPTR)""; 
 | |
|   } 
 | |
|    
 | |
| if (rlength == PCRE2_ZERO_TERMINATED) rlength = PRIV(strlen)(replacement);
 | |
| repend = replacement + rlength;
 | |
| 
 | |
| /* Check for using a match that has already happened. Note that the subject
 | |
| pointer in the match data may be NULL after a no-match. */
 | |
| 
 | |
| use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0);
 | |
| replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0);
 | |
| 
 | |
| /* If starting from an existing match, there must be an externally provided
 | |
| match data block. We create an internal match_data block in two cases: (a) an
 | |
| external one is not supplied (and we are not starting from an existing match);
 | |
| (b) an existing match is to be used for the first substitution. In the latter
 | |
| case, we copy the existing match into the internal block. This ensures that no
 | |
| changes are made to the existing match data block. */
 | |
| 
 | |
| if (match_data == NULL)
 | |
|   {
 | |
|   pcre2_general_context *gcontext;
 | |
|   if (use_existing_match) return PCRE2_ERROR_NULL;
 | |
|   gcontext = (mcontext == NULL)?
 | |
|     (pcre2_general_context *)code :
 | |
|     (pcre2_general_context *)mcontext;
 | |
|   match_data = internal_match_data =
 | |
|     pcre2_match_data_create_from_pattern(code, gcontext);
 | |
|   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
 | |
|   }
 | |
| 
 | |
| else if (use_existing_match)
 | |
|   {
 | |
|   pcre2_general_context *gcontext = (mcontext == NULL)?
 | |
|     (pcre2_general_context *)code :
 | |
|     (pcre2_general_context *)mcontext;
 | |
|   int pairs = (code->top_bracket + 1 < match_data->oveccount)?
 | |
|     code->top_bracket + 1 : match_data->oveccount;
 | |
|   internal_match_data = pcre2_match_data_create(match_data->oveccount,
 | |
|     gcontext);
 | |
|   if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY;
 | |
|   memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector)
 | |
|     + 2*pairs*sizeof(PCRE2_SIZE));
 | |
|   match_data = internal_match_data;
 | |
|   }
 | |
| 
 | |
| /* Remember ovector details */
 | |
| 
 | |
| ovector = pcre2_get_ovector_pointer(match_data);
 | |
| ovector_count = pcre2_get_ovector_count(match_data);
 | |
| 
 | |
| /* Fixed things in the callout block */
 | |
| 
 | |
| scb.version = 0;
 | |
| scb.input = subject;
 | |
| scb.output = (PCRE2_SPTR)buffer;
 | |
| scb.ovector = ovector;
 | |
| 
 | |
| /* A NULL subject of zero length is treated as an empty string. */
 | |
| 
 | |
| if (subject == NULL)
 | |
|   {
 | |
|   if (length != 0) return PCRE2_ERROR_NULL; 
 | |
|   subject = (PCRE2_SPTR)"";
 | |
|   } 
 | |
| 
 | |
| /* Find length of zero-terminated subject */
 | |
| 
 | |
| if (length == PCRE2_ZERO_TERMINATED)
 | |
|   length = subject? PRIV(strlen)(subject) : 0;
 | |
| 
 | |
| /* Check UTF replacement string if necessary. */
 | |
| 
 | |
| #ifdef SUPPORT_UNICODE
 | |
| if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
 | |
|   {
 | |
|   rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
 | |
|   if (rc != 0)
 | |
|     {
 | |
|     match_data->leftchar = 0;
 | |
|     goto EXIT;
 | |
|     }
 | |
|   }
 | |
| #endif  /* SUPPORT_UNICODE */
 | |
| 
 | |
| /* Save the substitute options and remove them from the match options. */
 | |
| 
 | |
| suboptions = options & SUBSTITUTE_OPTIONS;
 | |
| options &= ~SUBSTITUTE_OPTIONS;
 | |
| 
 | |
| /* Error if the start match offset is greater than the length of the subject. */
 | |
| 
 | |
| if (start_offset > length)
 | |
|   {
 | |
|   match_data->leftchar = 0;
 | |
|   rc = PCRE2_ERROR_BADOFFSET;
 | |
|   goto EXIT;
 | |
|   }
 | |
| 
 | |
| /* Copy up to the start offset, unless only the replacement is required. */
 | |
| 
 | |
| if (!replacement_only) CHECKMEMCPY(subject, start_offset);
 | |
| 
 | |
| /* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first
 | |
| match is taken from the match_data that was passed in. */
 | |
| 
 | |
| subs = 0;
 | |
| do
 | |
|   {
 | |
|   PCRE2_SPTR ptrstack[PTR_STACK_SIZE];
 | |
|   uint32_t ptrstackptr = 0;
 | |
| 
 | |
|   if (use_existing_match)
 | |
|     {
 | |
|     rc = match_data->rc;
 | |
|     use_existing_match = FALSE;
 | |
|     }
 | |
|   else rc = pcre2_match(code, subject, length, start_offset, options|goptions,
 | |
|     match_data, mcontext);
 | |
| 
 | |
| #ifdef SUPPORT_UNICODE
 | |
|   if (utf) options |= PCRE2_NO_UTF_CHECK;  /* Only need to check once */
 | |
| #endif
 | |
| 
 | |
|   /* Any error other than no match returns the error code. No match when not
 | |
|   doing the special after-empty-match global rematch, or when at the end of the
 | |
|   subject, breaks the global loop. Otherwise, advance the starting point by one
 | |
|   character, copying it to the output, and try again. */
 | |
| 
 | |
|   if (rc < 0)
 | |
|     {
 | |
|     PCRE2_SIZE save_start;
 | |
| 
 | |
|     if (rc != PCRE2_ERROR_NOMATCH) goto EXIT;
 | |
|     if (goptions == 0 || start_offset >= length) break;
 | |
| 
 | |
|     /* Advance by one code point. Then, if CRLF is a valid newline sequence and
 | |
|     we have advanced into the middle of it, advance one more code point. In
 | |
|     other words, do not start in the middle of CRLF, even if CR and LF on their
 | |
|     own are valid newlines. */
 | |
| 
 | |
|     save_start = start_offset++;
 | |
|     if (subject[start_offset-1] == CHAR_CR &&
 | |
|         code->newline_convention != PCRE2_NEWLINE_CR &&
 | |
|         code->newline_convention != PCRE2_NEWLINE_LF &&
 | |
|         start_offset < length &&
 | |
|         subject[start_offset] == CHAR_LF)
 | |
|       start_offset++;
 | |
| 
 | |
|     /* Otherwise, in UTF mode, advance past any secondary code points. */
 | |
| 
 | |
|     else if ((code->overall_options & PCRE2_UTF) != 0)
 | |
|       {
 | |
| #if PCRE2_CODE_UNIT_WIDTH == 8
 | |
|       while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80)
 | |
|         start_offset++;
 | |
| #elif PCRE2_CODE_UNIT_WIDTH == 16
 | |
|       while (start_offset < length &&
 | |
|             (subject[start_offset] & 0xfc00) == 0xdc00)
 | |
|         start_offset++;
 | |
| #endif
 | |
|       }
 | |
| 
 | |
|     /* Copy what we have advanced past (unless not required), reset the special
 | |
|     global options, and continue to the next match. */
 | |
| 
 | |
|     fraglength = start_offset - save_start;
 | |
|     if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength);
 | |
|     goptions = 0;
 | |
|     continue;
 | |
|     }
 | |
| 
 | |
|   /* Handle a successful match. Matches that use \K to end before they start
 | |
|   or start before the current point in the subject are not supported. */
 | |
| 
 | |
|   if (ovector[1] < ovector[0] || ovector[0] < start_offset)
 | |
|     {
 | |
|     rc = PCRE2_ERROR_BADSUBSPATTERN;
 | |
|     goto EXIT;
 | |
|     }
 | |
| 
 | |
|   /* Check for the same match as previous. This is legitimate after matching an
 | |
|   empty string that starts after the initial match offset. We have tried again
 | |
|   at the match point in case the pattern is one like /(?<=\G.)/ which can never
 | |
|   match at its starting point, so running the match achieves the bumpalong. If
 | |
|   we do get the same (null) match at the original match point, it isn't such a
 | |
|   pattern, so we now do the empty string magic. In all other cases, a repeat
 | |
|   match should never occur. */
 | |
| 
 | |
|   if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
 | |
|     {
 | |
|     if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
 | |
|       {
 | |
|       goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
 | |
|       ovecsave[2] = start_offset;
 | |
|       continue;    /* Back to the top of the loop */
 | |
|       }
 | |
|     rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
 | |
|     goto EXIT;
 | |
|     }
 | |
| 
 | |
|   /* Count substitutions with a paranoid check for integer overflow; surely no
 | |
|   real call to this function would ever hit this! */
 | |
| 
 | |
|   if (subs == INT_MAX)
 | |
|     {
 | |
|     rc = PCRE2_ERROR_TOOMANYREPLACE;
 | |
|     goto EXIT;
 | |
|     }
 | |
|   subs++;
 | |
| 
 | |
|   /* Copy the text leading up to the match (unless not required), and remember
 | |
|   where the insert begins and how many ovector pairs are set. */
 | |
| 
 | |
|   if (rc == 0) rc = ovector_count;
 | |
|   fraglength = ovector[0] - start_offset;
 | |
|   if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength);
 | |
|   scb.output_offsets[0] = buff_offset;
 | |
|   scb.oveccount = rc;
 | |
| 
 | |
|   /* Process the replacement string. If the entire replacement is literal, just
 | |
|   copy it with length check. */
 | |
| 
 | |
|   ptr = replacement;
 | |
|   if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0)
 | |
|     {
 | |
|     CHECKMEMCPY(ptr, rlength);
 | |
|     }
 | |
| 
 | |
|   /* Within a non-literal replacement, which must be scanned character by
 | |
|   character, local literal mode can be set by \Q, but only in extended mode
 | |
|   when backslashes are being interpreted. In extended mode we must handle
 | |
|   nested substrings that are to be reprocessed. */
 | |
| 
 | |
|   else for (;;)
 | |
|     {
 | |
|     uint32_t ch;
 | |
|     unsigned int chlen;
 | |
| 
 | |
|     /* If at the end of a nested substring, pop the stack. */
 | |
| 
 | |
|     if (ptr >= repend)
 | |
|       {
 | |
|       if (ptrstackptr == 0) break;       /* End of replacement string */
 | |
|       repend = ptrstack[--ptrstackptr];
 | |
|       ptr = ptrstack[--ptrstackptr];
 | |
|       continue;
 | |
|       }
 | |
| 
 | |
|     /* Handle the next character */
 | |
| 
 | |
|     if (escaped_literal)
 | |
|       {
 | |
|       if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E)
 | |
|         {
 | |
|         escaped_literal = FALSE;
 | |
|         ptr += 2;
 | |
|         continue;
 | |
|         }
 | |
|       goto LOADLITERAL;
 | |
|       }
 | |
| 
 | |
|     /* Not in literal mode. */
 | |
| 
 | |
|     if (*ptr == CHAR_DOLLAR_SIGN)
 | |
|       {
 | |
|       int group, n;
 | |
|       uint32_t special = 0;
 | |
|       BOOL inparens;
 | |
|       BOOL star;
 | |
|       PCRE2_SIZE sublength;
 | |
|       PCRE2_SPTR text1_start = NULL;
 | |
|       PCRE2_SPTR text1_end = NULL;
 | |
|       PCRE2_SPTR text2_start = NULL;
 | |
|       PCRE2_SPTR text2_end = NULL;
 | |
|       PCRE2_UCHAR next;
 | |
|       PCRE2_UCHAR name[33];
 | |
| 
 | |
|       if (++ptr >= repend) goto BAD;
 | |
|       if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
 | |
| 
 | |
|       group = -1;
 | |
|       n = 0;
 | |
|       inparens = FALSE;
 | |
|       star = FALSE;
 | |
| 
 | |
|       if (next == CHAR_LEFT_CURLY_BRACKET)
 | |
|         {
 | |
|         if (++ptr >= repend) goto BAD;
 | |
|         next = *ptr;
 | |
|         inparens = TRUE;
 | |
|         }
 | |
| 
 | |
|       if (next == CHAR_ASTERISK)
 | |
|         {
 | |
|         if (++ptr >= repend) goto BAD;
 | |
|         next = *ptr;
 | |
|         star = TRUE;
 | |
|         }
 | |
| 
 | |
|       if (!star && next >= CHAR_0 && next <= CHAR_9)
 | |
|         {
 | |
|         group = next - CHAR_0;
 | |
|         while (++ptr < repend)
 | |
|           {
 | |
|           next = *ptr;
 | |
|           if (next < CHAR_0 || next > CHAR_9) break;
 | |
|           group = group * 10 + next - CHAR_0;
 | |
| 
 | |
|           /* A check for a number greater than the hightest captured group
 | |
|           is sufficient here; no need for a separate overflow check. If unknown
 | |
|           groups are to be treated as unset, just skip over any remaining
 | |
|           digits and carry on. */
 | |
| 
 | |
|           if (group > code->top_bracket)
 | |
|             {
 | |
|             if ((suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
 | |
|               {
 | |
|               while (++ptr < repend && *ptr >= CHAR_0 && *ptr <= CHAR_9);
 | |
|               break;
 | |
|               }
 | |
|             else
 | |
|               {
 | |
|               rc = PCRE2_ERROR_NOSUBSTRING;
 | |
|               goto PTREXIT;
 | |
|               }
 | |
|             }
 | |
|           }
 | |
|         }
 | |
|       else
 | |
|         {
 | |
|         const uint8_t *ctypes = code->tables + ctypes_offset;
 | |
|         while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
 | |
|           {
 | |
|           name[n++] = next;
 | |
|           if (n > 32) goto BAD;
 | |
|           if (++ptr >= repend) break;
 | |
|           next = *ptr;
 | |
|           }
 | |
|         if (n == 0) goto BAD;
 | |
|         name[n] = 0;
 | |
|         }
 | |
| 
 | |
|       /* In extended mode we recognize ${name:+set text:unset text} and
 | |
|       ${name:-default text}. */
 | |
| 
 | |
|       if (inparens)
 | |
|         {
 | |
|         if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
 | |
|              !star && ptr < repend - 2 && next == CHAR_COLON)
 | |
|           {
 | |
|           special = *(++ptr);
 | |
|           if (special != CHAR_PLUS && special != CHAR_MINUS)
 | |
|             {
 | |
|             rc = PCRE2_ERROR_BADSUBSTITUTION;
 | |
|             goto PTREXIT;
 | |
|             }
 | |
| 
 | |
|           text1_start = ++ptr;
 | |
|           rc = find_text_end(code, &ptr, repend, special == CHAR_MINUS);
 | |
|           if (rc != 0) goto PTREXIT;
 | |
|           text1_end = ptr;
 | |
| 
 | |
|           if (special == CHAR_PLUS && *ptr == CHAR_COLON)
 | |
|             {
 | |
|             text2_start = ++ptr;
 | |
|             rc = find_text_end(code, &ptr, repend, TRUE);
 | |
|             if (rc != 0) goto PTREXIT;
 | |
|             text2_end = ptr;
 | |
|             }
 | |
|           }
 | |
| 
 | |
|         else
 | |
|           {
 | |
|           if (ptr >= repend || *ptr != CHAR_RIGHT_CURLY_BRACKET)
 | |
|             {
 | |
|             rc = PCRE2_ERROR_REPMISSINGBRACE;
 | |
|             goto PTREXIT;
 | |
|             }
 | |
|           }
 | |
| 
 | |
|         ptr++;
 | |
|         }
 | |
| 
 | |
|       /* Have found a syntactically correct group number or name, or *name.
 | |
|       Only *MARK is currently recognized. */
 | |
| 
 | |
|       if (star)
 | |
|         {
 | |
|         if (PRIV(strcmp_c8)(name, STRING_MARK) == 0)
 | |
|           {
 | |
|           PCRE2_SPTR mark = pcre2_get_mark(match_data);
 | |
|           if (mark != NULL)
 | |
|             {
 | |
|             PCRE2_SPTR mark_start = mark;
 | |
|             while (*mark != 0) mark++;
 | |
|             fraglength = mark - mark_start;
 | |
|             CHECKMEMCPY(mark_start, fraglength);
 | |
|             }
 | |
|           }
 | |
|         else goto BAD;
 | |
|         }
 | |
| 
 | |
|       /* Substitute the contents of a group. We don't use substring_copy
 | |
|       functions any more, in order to support case forcing. */
 | |
| 
 | |
|       else
 | |
|         {
 | |
|         PCRE2_SPTR subptr, subptrend;
 | |
| 
 | |
|         /* Find a number for a named group. In case there are duplicate names,
 | |
|         search for the first one that is set. If the name is not found when
 | |
|         PCRE2_SUBSTITUTE_UNKNOWN_EMPTY is set, set the group number to a
 | |
|         non-existent group. */
 | |
| 
 | |
|         if (group < 0)
 | |
|           {
 | |
|           PCRE2_SPTR first, last, entry;
 | |
|           rc = pcre2_substring_nametable_scan(code, name, &first, &last);
 | |
|           if (rc == PCRE2_ERROR_NOSUBSTRING &&
 | |
|               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
 | |
|             {
 | |
|             group = code->top_bracket + 1;
 | |
|             }
 | |
|           else
 | |
|             {
 | |
|             if (rc < 0) goto PTREXIT;
 | |
|             for (entry = first; entry <= last; entry += rc)
 | |
|               {
 | |
|               uint32_t ng = GET2(entry, 0);
 | |
|               if (ng < ovector_count)
 | |
|                 {
 | |
|                 if (group < 0) group = ng;          /* First in ovector */
 | |
|                 if (ovector[ng*2] != PCRE2_UNSET)
 | |
|                   {
 | |
|                   group = ng;                       /* First that is set */
 | |
|                   break;
 | |
|                   }
 | |
|                 }
 | |
|               }
 | |
| 
 | |
|             /* If group is still negative, it means we did not find a group
 | |
|             that is in the ovector. Just set the first group. */
 | |
| 
 | |
|             if (group < 0) group = GET2(first, 0);
 | |
|             }
 | |
|           }
 | |
| 
 | |
|         /* We now have a group that is identified by number. Find the length of
 | |
|         the captured string. If a group in a non-special substitution is unset
 | |
|         when PCRE2_SUBSTITUTE_UNSET_EMPTY is set, substitute nothing. */
 | |
| 
 | |
|         rc = pcre2_substring_length_bynumber(match_data, group, &sublength);
 | |
|         if (rc < 0)
 | |
|           {
 | |
|           if (rc == PCRE2_ERROR_NOSUBSTRING &&
 | |
|               (suboptions & PCRE2_SUBSTITUTE_UNKNOWN_UNSET) != 0)
 | |
|             {
 | |
|             rc = PCRE2_ERROR_UNSET;
 | |
|             }
 | |
|           if (rc != PCRE2_ERROR_UNSET) goto PTREXIT;  /* Non-unset errors */
 | |
|           if (special == 0)                           /* Plain substitution */
 | |
|             {
 | |
|             if ((suboptions & PCRE2_SUBSTITUTE_UNSET_EMPTY) != 0) continue;
 | |
|             goto PTREXIT;                             /* Else error */
 | |
|             }
 | |
|           }
 | |
| 
 | |
|         /* If special is '+' we have a 'set' and possibly an 'unset' text,
 | |
|         both of which are reprocessed when used. If special is '-' we have a
 | |
|         default text for when the group is unset; it must be reprocessed. */
 | |
| 
 | |
|         if (special != 0)
 | |
|           {
 | |
|           if (special == CHAR_MINUS)
 | |
|             {
 | |
|             if (rc == 0) goto LITERAL_SUBSTITUTE;
 | |
|             text2_start = text1_start;
 | |
|             text2_end = text1_end;
 | |
|             }
 | |
| 
 | |
|           if (ptrstackptr >= PTR_STACK_SIZE) goto BAD;
 | |
|           ptrstack[ptrstackptr++] = ptr;
 | |
|           ptrstack[ptrstackptr++] = repend;
 | |
| 
 | |
|           if (rc == 0)
 | |
|             {
 | |
|             ptr = text1_start;
 | |
|             repend = text1_end;
 | |
|             }
 | |
|           else
 | |
|             {
 | |
|             ptr = text2_start;
 | |
|             repend = text2_end;
 | |
|             }
 | |
|           continue;
 | |
|           }
 | |
| 
 | |
|         /* Otherwise we have a literal substitution of a group's contents. */
 | |
| 
 | |
|         LITERAL_SUBSTITUTE:
 | |
|         subptr = subject + ovector[group*2];
 | |
|         subptrend = subject + ovector[group*2 + 1];
 | |
| 
 | |
|         /* Substitute a literal string, possibly forcing alphabetic case. */
 | |
| 
 | |
|         while (subptr < subptrend)
 | |
|           {
 | |
|           GETCHARINCTEST(ch, subptr);
 | |
|           if (forcecase != 0)
 | |
|             {
 | |
| #ifdef SUPPORT_UNICODE
 | |
|             if (utf || ucp)
 | |
|               {
 | |
|               uint32_t type = UCD_CHARTYPE(ch);
 | |
|               if (PRIV(ucp_gentype)[type] == ucp_L &&
 | |
|                   type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
 | |
|                 ch = UCD_OTHERCASE(ch);
 | |
|               }
 | |
|             else
 | |
| #endif
 | |
|               {
 | |
|               if (((code->tables + cbits_offset +
 | |
|                   ((forcecase > 0)? cbit_upper:cbit_lower)
 | |
|                   )[ch/8] & (1u << (ch%8))) == 0)
 | |
|                 ch = (code->tables + fcc_offset)[ch];
 | |
|               }
 | |
|             forcecase = forcecasereset;
 | |
|             }
 | |
| 
 | |
| #ifdef SUPPORT_UNICODE
 | |
|           if (utf) chlen = PRIV(ord2utf)(ch, temp); else
 | |
| #endif
 | |
|             {
 | |
|             temp[0] = ch;
 | |
|             chlen = 1;
 | |
|             }
 | |
|           CHECKMEMCPY(temp, chlen);
 | |
|           }
 | |
|         }
 | |
|       }
 | |
| 
 | |
|     /* Handle an escape sequence in extended mode. We can use check_escape()
 | |
|     to process \Q, \E, \c, \o, \x and \ followed by non-alphanumerics, but
 | |
|     the case-forcing escapes are not supported in pcre2_compile() so must be
 | |
|     recognized here. */
 | |
| 
 | |
|     else if ((suboptions & PCRE2_SUBSTITUTE_EXTENDED) != 0 &&
 | |
|               *ptr == CHAR_BACKSLASH)
 | |
|       {
 | |
|       int errorcode;
 | |
| 
 | |
|       if (ptr < repend - 1) switch (ptr[1])
 | |
|         {
 | |
|         case CHAR_L:
 | |
|         forcecase = forcecasereset = -1;
 | |
|         ptr += 2;
 | |
|         continue;
 | |
| 
 | |
|         case CHAR_l:
 | |
|         forcecase = -1;
 | |
|         forcecasereset = 0;
 | |
|         ptr += 2;
 | |
|         continue;
 | |
| 
 | |
|         case CHAR_U:
 | |
|         forcecase = forcecasereset = 1;
 | |
|         ptr += 2;
 | |
|         continue;
 | |
| 
 | |
|         case CHAR_u:
 | |
|         forcecase = 1;
 | |
|         forcecasereset = 0;
 | |
|         ptr += 2;
 | |
|         continue;
 | |
| 
 | |
|         default:
 | |
|         break;
 | |
|         }
 | |
| 
 | |
|       ptr++;  /* Point after \ */
 | |
|       rc = PRIV(check_escape)(&ptr, repend, &ch, &errorcode,
 | |
|         code->overall_options, code->extra_options, FALSE, NULL);
 | |
|       if (errorcode != 0) goto BADESCAPE;
 | |
| 
 | |
|       switch(rc)
 | |
|         {
 | |
|         case ESC_E:
 | |
|         forcecase = forcecasereset = 0;
 | |
|         continue;
 | |
| 
 | |
|         case ESC_Q:
 | |
|         escaped_literal = TRUE;
 | |
|         continue;
 | |
| 
 | |
|         case 0:      /* Data character */
 | |
|         goto LITERAL;
 | |
| 
 | |
|         default:
 | |
|         goto BADESCAPE;
 | |
|         }
 | |
|       }
 | |
| 
 | |
|     /* Handle a literal code unit */
 | |
| 
 | |
|     else
 | |
|       {
 | |
|       LOADLITERAL:
 | |
|       GETCHARINCTEST(ch, ptr);    /* Get character value, increment pointer */
 | |
| 
 | |
|       LITERAL:
 | |
|       if (forcecase != 0)
 | |
|         {
 | |
| #ifdef SUPPORT_UNICODE
 | |
|         if (utf || ucp)
 | |
|           {
 | |
|           uint32_t type = UCD_CHARTYPE(ch);
 | |
|           if (PRIV(ucp_gentype)[type] == ucp_L &&
 | |
|               type != ((forcecase > 0)? ucp_Lu : ucp_Ll))
 | |
|             ch = UCD_OTHERCASE(ch);
 | |
|           }
 | |
|         else
 | |
| #endif
 | |
|           {
 | |
|           if (((code->tables + cbits_offset +
 | |
|               ((forcecase > 0)? cbit_upper:cbit_lower)
 | |
|               )[ch/8] & (1u << (ch%8))) == 0)
 | |
|             ch = (code->tables + fcc_offset)[ch];
 | |
|           }
 | |
|         forcecase = forcecasereset;
 | |
|         }
 | |
| 
 | |
| #ifdef SUPPORT_UNICODE
 | |
|       if (utf) chlen = PRIV(ord2utf)(ch, temp); else
 | |
| #endif
 | |
|         {
 | |
|         temp[0] = ch;
 | |
|         chlen = 1;
 | |
|         }
 | |
|       CHECKMEMCPY(temp, chlen);
 | |
|       } /* End handling a literal code unit */
 | |
|     }   /* End of loop for scanning the replacement. */
 | |
| 
 | |
|   /* The replacement has been copied to the output, or its size has been
 | |
|   remembered. Do the callout if there is one and we have done an actual
 | |
|   replacement. */
 | |
| 
 | |
|   if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL)
 | |
|     {
 | |
|     scb.subscount = subs;
 | |
|     scb.output_offsets[1] = buff_offset;
 | |
|     rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
 | |
| 
 | |
|     /* A non-zero return means cancel this substitution. Instead, copy the
 | |
|     matched string fragment. */
 | |
| 
 | |
|     if (rc != 0)
 | |
|       {
 | |
|       PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0];
 | |
|       PCRE2_SIZE oldlength = ovector[1] - ovector[0];
 | |
| 
 | |
|       buff_offset -= newlength;
 | |
|       lengthleft += newlength;
 | |
|       if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength);
 | |
| 
 | |
|       /* A negative return means do not do any more. */
 | |
| 
 | |
|       if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL);
 | |
|       }
 | |
|     }
 | |
| 
 | |
|   /* Save the details of this match. See above for how this data is used. If we
 | |
|   matched an empty string, do the magic for global matches. Update the start
 | |
|   offset to point to the rest of the subject string. If we re-used an existing
 | |
|   match for the first match, switch to the internal match data block. */
 | |
| 
 | |
|   ovecsave[0] = ovector[0];
 | |
|   ovecsave[1] = ovector[1];
 | |
|   ovecsave[2] = start_offset;
 | |
| 
 | |
|   goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
 | |
|     PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
 | |
|   start_offset = ovector[1];
 | |
|   } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0);  /* Repeat "do" loop */
 | |
| 
 | |
| /* Copy the rest of the subject unless not required, and terminate the output
 | |
| with a binary zero. */
 | |
| 
 | |
| if (!replacement_only)
 | |
|   {
 | |
|   fraglength = length - start_offset;
 | |
|   CHECKMEMCPY(subject + start_offset, fraglength);
 | |
|   }
 | |
| 
 | |
| temp[0] = 0;
 | |
| CHECKMEMCPY(temp, 1);
 | |
| 
 | |
| /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set,
 | |
| and matching has carried on after a full buffer, in order to compute the length
 | |
| needed. Otherwise, an overflow generates an immediate error return. */
 | |
| 
 | |
| if (overflowed)
 | |
|   {
 | |
|   rc = PCRE2_ERROR_NOMEMORY;
 | |
|   *blength = buff_length + extra_needed;
 | |
|   }
 | |
| 
 | |
| /* After a successful execution, return the number of substitutions and set the
 | |
| length of buffer used, excluding the trailing zero. */
 | |
| 
 | |
| else
 | |
|   {
 | |
|   rc = subs;
 | |
|   *blength = buff_offset - 1;
 | |
|   }
 | |
| 
 | |
| EXIT:
 | |
| if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data);
 | |
|   else match_data->rc = rc;
 | |
| return rc;
 | |
| 
 | |
| NOROOM:
 | |
| rc = PCRE2_ERROR_NOMEMORY;
 | |
| goto EXIT;
 | |
| 
 | |
| BAD:
 | |
| rc = PCRE2_ERROR_BADREPLACEMENT;
 | |
| goto PTREXIT;
 | |
| 
 | |
| BADESCAPE:
 | |
| rc = PCRE2_ERROR_BADREPESCAPE;
 | |
| 
 | |
| PTREXIT:
 | |
| *blength = (PCRE2_SIZE)(ptr - replacement);
 | |
| goto EXIT;
 | |
| }
 | |
| 
 | |
| /* End of pcre2_substitute.c */
 |