• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kjs
 

kjs

  • kjs
regexp.cpp
1 /*
2  * This file is part of the KDE libraries
3  * Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
4  * Copyright (C) 2003,2004 Apple Computer, Inc.
5  * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  *
21  */
22 
23 #include "regexp.h"
24 
25 #include "lexer.h"
26 #include <assert.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 using namespace KJS;
32 
// Class-wide record of whether UTF-8 matching is available. It starts out as
// Unknown and is resolved to Supported/Unsupported by the RegExp constructor.
33 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
34 
35 RegExp::RegExp(const UString &p, int f)
36  : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
37 {
38  // Determine whether libpcre has unicode support if need be..
39  if (utf8Support == Unknown) {
40  uint32_t supported;
41  pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported);
42  utf8Support = (supported & 0x0001) ? Supported : Unsupported;
43  }
44 
45  nrSubPatterns = 0; // determined in match() with POSIX regex.
46 
47  // JS regexps can contain Unicode escape sequences (\uxxxx) which
48  // are rather uncommon elsewhere. As our regexp libs don't understand
49  // them we do the unescaping ourselves internally.
50  // Also make sure to expand out any nulls as pcre_compile
51  // expects null termination..
52  UString intern;
53  const char* const nil = "\\x00";
54  if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
55  bool escape = false;
56  for (int i = 0; i < p.size(); ++i) {
57  UChar c = p[i];
58  if (escape) {
59  escape = false;
60  // we only care about \u
61  if (c == 'u') {
62  // standard unicode escape sequence looks like \uxxxx but
63  // other browsers also accept less then 4 hex digits
64  unsigned short u = 0;
65  int j = 0;
66  for (j = 0; j < 4; ++j) {
67  if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
68  u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
69  ++i;
70  } else {
71  // sequence incomplete. restore index.
72  // TODO: cleaner way to propagate warning
73  fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
74  i -= j;
75  break;
76  }
77  }
78  if (j < 4) {
79  // sequence was incomplete. treat \u as u which IE always
80  // and FF sometimes does.
81  intern.append(UString('u'));
82  } else {
83  c = UChar(u);
84  switch (u) {
85  case 0:
86  // Make sure to encode 0, to avoid terminating the string
87  intern += UString(nil);
88  break;
89  case '^':
90  case '$':
91  case '\\':
92  case '.':
93  case '*':
94  case '+':
95  case '?':
96  case '(': case ')':
97  case '{': case '}':
98  case '[': case ']':
99  case '|':
100  // escape pattern characters have to remain escaped
101  intern.append(UString('\\'));
102  // intentional fallthrough
103  default:
104  intern += UString(&c, 1);
105  break;
106  }
107  }
108  continue;
109  }
110  intern += UString('\\');
111  intern += UString(&c, 1);
112  } else {
113  if (c == '\\')
114  escape = true;
115  else if (c == '\0')
116  intern += UString(nil);
117  else
118  intern += UString(&c, 1);
119  }
120  }
121  } else {
122  intern = p;
123  }
124 
125 #ifdef HAVE_PCRE2POSIX
126  uint32_t pcre2flags = 0;
127  int errorCode;
128  PCRE2_SIZE errorOffset;
129 
130  if (flgs & IgnoreCase)
131  pcre2flags |= PCRE2_CASELESS;
132 
133  if (flgs & Multiline)
134  pcre2flags |= PCRE2_MULTILINE;
135 
136  if (utf8Support == Supported)
137  pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
138 
139  // Fill our buffer with an encoded version, whether utf-8, or,
140  // if PCRE is incapable, truncated.
141  prepareMatch(intern);
142 
143  pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
144  &errorCode, &errorOffset, NULL);
145  doneMatch(); // Cleanup buffers
146  if (!pcregex) {
147 #ifndef NDEBUG
148  PCRE2_UCHAR errorMsg[256];
149  pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg));
150  fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);
151 #endif
152  match_data = nullptr;
153  valid = false;
154  return;
155  }
156 
157  // Get number of subpatterns that will be returned
158  int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
159  if (rc != 0)
160  {
161  nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
162  }
163 
164  match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
165 #else
166 
167  int regflags = 0;
168 #ifdef REG_EXTENDED
169  regflags |= REG_EXTENDED;
170 #endif
171 #ifdef REG_ICASE
172  if ( f & IgnoreCase )
173  regflags |= REG_ICASE;
174 #endif
175 
176  //NOTE: Multiline is not feasible with POSIX regex.
177  //if ( f & Multiline )
178  // ;
179  // Note: the Global flag is already handled by RegExpProtoFunc::execute
180 
181  int errorCode = regcomp(&preg, intern.ascii(), regflags);
182  if (errorCode != 0) {
183 #ifndef NDEBUG
184  char errorMessage[80];
185  regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
186  fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
187 #endif
188  valid = false;
189  }
190 #endif
191 }
192 
193 RegExp::~RegExp()
194 {
195  doneMatch(); // Be 100% sure buffers are freed
196 #ifdef HAVE_PCRE2POSIX
197  if (match_data)
198  {
199  pcre2_match_data_free(match_data);
200  }
201  if (pcregex)
202  {
203  pcre2_code_free(pcregex);
204  }
205 #else
206  /* TODO: is this really okay after an error ? */
207  regfree(&preg);
208 #endif
209 }
210 
211 void RegExp::prepareUtf8(const UString& s)
212 {
213  // Allocate a buffer big enough to hold all the characters plus \0
214  const int length = s.size();
215  buffer = new buftype_t[length * 3 + 1];
216 
217  // Also create buffer for positions. We need one extra character in there,
218  // even past the \0 since the non-empty handling may jump one past the end
219  originalPos = new int[length * 3 + 2];
220 
221  // Convert to runs of 8-bit characters, and generate indeces
222  // Note that we do NOT combine surrogate pairs here, as
223  // regexps operate on them as separate characters
224  buftype_t *p = buffer;
225  int *posOut = originalPos;
226  const UChar *d = s.data();
227  for (int i = 0; i != length; ++i) {
228  unsigned short c = d[i].unicode();
229 
230  int sequenceLen;
231  if (c < 0x80) {
232  *p++ = (buftype_t)c;
233  sequenceLen = 1;
234  } else if (c < 0x800) {
235  *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
236  *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
237  sequenceLen = 2;
238  } else {
239  *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
240  *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
241  *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
242  sequenceLen = 3;
243  }
244 
245  while (sequenceLen > 0) {
246  *posOut = i;
247  ++posOut;
248  --sequenceLen;
249  }
250  }
251 
252  bufferSize = p - buffer;
253 
254  *p++ = '\0';
255 
256  // Record positions for \0, and the fictional character after that.
257  *posOut = length;
258  *(posOut+1) = length+1;
259 }
260 
261 void RegExp::prepareASCII (const UString& s)
262 {
263  originalPos = 0;
264 
265  // Best-effort attempt to get something done
266  // when we don't have utf 8 available -- use
267  // truncated version, and pray for the best
268  CString truncated = s.cstring();
269  buffer = new buftype_t[truncated.size() + 1];
270  memcpy(buffer, truncated.c_str(), truncated.size());
271  buffer[truncated.size()] = '\0'; // For _compile use
272  bufferSize = truncated.size();
273 }
274 
275 void RegExp::prepareMatch(const UString &s)
276 {
277  delete[] originalPos; // Just to be sure..
278  delete[] buffer;
279  if (utf8Support == Supported)
280  prepareUtf8(s);
281  else
282  prepareASCII(s);
283 
284 #ifndef NDEBUG
285  originalS = s;
286 #endif
287 }
288 
289 void RegExp::doneMatch()
290 {
291  delete[] originalPos; originalPos = 0;
292  delete[] buffer; buffer = 0;
293 }
294 
295 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
296 {
297 #ifndef NDEBUG
298  assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
299 #endif
300  assert(valid);
301 
302  if (i < 0)
303  i = 0;
304  if (ovector)
305  *ovector = 0L;
306  int dummyPos;
307  if (!pos)
308  pos = &dummyPos;
309  *pos = -1;
310  if (i > s.size() || s.isNull())
311  return UString::null;
312 
313 #ifdef HAVE_PCRE2POSIX
314  if (!pcregex || !match_data)
315  return UString::null;
316  if (!ovector)
317  return UString::null;
318 
319  int startPos;
320  int nextPos;
321  if (utf8Support == Supported)
322  {
323  startPos = i;
324  while (originalPos[startPos] < i)
325  ++startPos;
326 
327  nextPos = startPos;
328  if (i < s.size()) {
329  while (originalPos[nextPos] < (i + 1))
330  ++nextPos;
331  }
332  }
333  else
334  {
335  startPos = i;
336  nextPos = i + (i < s.size() ? 1 : 0);
337  }
338 
339  uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
340  if (m_notEmpty)
341  {
342  baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
343  }
344  int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
345  if (numMatches <= 0)
346  {
347  // Failed to match.
348  if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
349  {
350  // We set m_notEmpty ourselves, to look for a non-empty match
351  // So we don't stop here, we want to try again at i+1.
352 #ifdef KJS_VERBOSE
353  fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
354 #endif
355  m_notEmpty = 0;
356  baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
357  numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
358  if (numMatches <= 0)
359  return UString::null;
360  }
361  else
362  return UString::null;
363  }
364 
365  PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
366  if (!pcre2_ovector)
367  return UString::null;
368 
369  uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
370  *ovector = new int[pcre2_ovecCount * 2];
371  if (originalPos)
372  {
373  for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
374  {
375  (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
376  }
377  }
378  else
379  {
380  for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
381  {
382  (*ovector)[c] = pcre2_ovector[c];
383  }
384  }
385 #else
386  const uint maxMatch = 10;
387  regmatch_t rmatch[maxMatch];
388 
389  char *str = strdup(s.ascii()); // TODO: why ???
390  if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
391  free(str);
392  return UString::null;
393  }
394  free(str);
395 
396  if (!ovector) {
397  *pos = rmatch[0].rm_so + i;
398  return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
399  }
400 
401  // map rmatch array to ovector used in PCRE case
402  nrSubPatterns = 0;
403  for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
404  nrSubPatterns++;
405  // if the nonEmpty flag is set, return a failed match if any of the
406  // subMatches happens to be an empty string.
407  if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
408  return UString::null;
409  }
410  // Allow an ovector slot to return the (failed) match result.
411  if (nrSubPatterns == 0) nrSubPatterns = 1;
412 
413  int ovecsize = (nrSubPatterns)*3; // see above
414  *ovector = new int[ovecsize];
415  for (uint j = 0; j < nrSubPatterns; j++) {
416  (*ovector)[2*j] = rmatch[j].rm_so + i;
417  (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
418  }
419 #endif
420 
421  *pos = (*ovector)[0];
422  if ( *pos == (*ovector)[1] && (flgs & Global) )
423  {
424  // empty match, next try will be with m_notEmpty=true
425  m_notEmpty=true;
426  }
427  return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
428 }
KJS::CString
8 bit char based string class
Definition: ustring.h:165
KJS::UString
Unicode string class.
Definition: ustring.h:189
KJS::UString::find
int find(const UString &f, int pos=0) const
Definition: ustring.cpp:798
KJS::UString::ascii
char * ascii() const
Convert the Unicode string to plain ASCII chars chopping of any higher bytes.
Definition: ustring.cpp:485
KJS::UString::isNull
bool isNull() const
Definition: ustring.h:343
KJS::UString::data
const UChar * data() const
Definition: ustring.h:339
KJS::UString::size
int size() const
Definition: ustring.h:359
KJS::UString::substr
UString substr(int pos=0, int len=-1) const
Definition: ustring.cpp:868
KJS::UString::cstring
CString cstring() const
Definition: ustring.cpp:480
KJS::UString::append
UString & append(const UString &)
Append another string.
Definition: ustring.cpp:457
KJS::UChar
Unicode character.
Definition: ustring.h:51
KJS::UChar::unicode
unsigned short unicode() const
Definition: ustring.h:81

kjs

Skip menu "kjs"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kjs

Skip menu "kjs"
  • arts
  • dcop
  • dnssd
  • interfaces
  •   kspeech
  •     interface
  •     library
  •   tdetexteditor
  • kate
  • kded
  • kdoctools
  • kimgio
  • kjs
  • libtdemid
  • libtdescreensaver
  • tdeabc
  • tdecmshell
  • tdecore
  • tdefx
  • tdehtml
  • tdeinit
  • tdeio
  •   bookmarks
  •   httpfilter
  •   kpasswdserver
  •   kssl
  •   tdefile
  •   tdeio
  •   tdeioexec
  • tdeioslave
  •   http
  • tdemdi
  •   tdemdi
  • tdenewstuff
  • tdeparts
  • tdeprint
  • tderandr
  • tderesources
  • tdespell2
  • tdesu
  • tdeui
  • tdeunittest
  • tdeutils
  • tdewallet
Generated for kjs by doxygen 1.9.1
This website is maintained by Timothy Pearson.