logo

oasis-root

Compiled tree of Oasis Linux, based on my own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

sre_parse.py (40230B)


  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. """Internal support module for sre"""
  11. # XXX: show string offset and offending character for all errors
  12. from sre_constants import *
  13. SPECIAL_CHARS = ".\\[{()*+?^$|"
  14. REPEAT_CHARS = "*+?{"
  15. DIGITS = frozenset("0123456789")
  16. OCTDIGITS = frozenset("01234567")
  17. HEXDIGITS = frozenset("0123456789abcdefABCDEF")
  18. ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
  19. WHITESPACE = frozenset(" \t\n\r\v\f")
  20. _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
  21. _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
  22. ESCAPES = {
  23. r"\a": (LITERAL, ord("\a")),
  24. r"\b": (LITERAL, ord("\b")),
  25. r"\f": (LITERAL, ord("\f")),
  26. r"\n": (LITERAL, ord("\n")),
  27. r"\r": (LITERAL, ord("\r")),
  28. r"\t": (LITERAL, ord("\t")),
  29. r"\v": (LITERAL, ord("\v")),
  30. r"\\": (LITERAL, ord("\\"))
  31. }
  32. CATEGORIES = {
  33. r"\A": (AT, AT_BEGINNING_STRING), # start of string
  34. r"\b": (AT, AT_BOUNDARY),
  35. r"\B": (AT, AT_NON_BOUNDARY),
  36. r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
  37. r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
  38. r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
  39. r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
  40. r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
  41. r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
  42. r"\Z": (AT, AT_END_STRING), # end of string
  43. }
  44. FLAGS = {
  45. # standard flags
  46. "i": SRE_FLAG_IGNORECASE,
  47. "L": SRE_FLAG_LOCALE,
  48. "m": SRE_FLAG_MULTILINE,
  49. "s": SRE_FLAG_DOTALL,
  50. "x": SRE_FLAG_VERBOSE,
  51. # extensions
  52. "a": SRE_FLAG_ASCII,
  53. "t": SRE_FLAG_TEMPLATE,
  54. "u": SRE_FLAG_UNICODE,
  55. }
  56. TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
  57. GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
  58. class Verbose(Exception):
  59. pass
  60. class State:
  61. # keeps track of state for parsing
  62. def __init__(self):
  63. self.flags = 0
  64. self.groupdict = {}
  65. self.groupwidths = [None] # group 0
  66. self.lookbehindgroups = None
  67. @property
  68. def groups(self):
  69. return len(self.groupwidths)
  70. def opengroup(self, name=None):
  71. gid = self.groups
  72. self.groupwidths.append(None)
  73. if self.groups > MAXGROUPS:
  74. raise error("too many groups")
  75. if name is not None:
  76. ogid = self.groupdict.get(name, None)
  77. if ogid is not None:
  78. raise error("redefinition of group name %r as group %d; "
  79. "was group %d" % (name, gid, ogid))
  80. self.groupdict[name] = gid
  81. return gid
  82. def closegroup(self, gid, p):
  83. self.groupwidths[gid] = p.getwidth()
  84. def checkgroup(self, gid):
  85. return gid < self.groups and self.groupwidths[gid] is not None
  86. def checklookbehindgroup(self, gid, source):
  87. if self.lookbehindgroups is not None:
  88. if not self.checkgroup(gid):
  89. raise source.error('cannot refer to an open group')
  90. if gid >= self.lookbehindgroups:
  91. raise source.error('cannot refer to group defined in the same '
  92. 'lookbehind subpattern')
  93. class SubPattern:
  94. # a subpattern, in intermediate form
  95. def __init__(self, state, data=None):
  96. self.state = state
  97. if data is None:
  98. data = []
  99. self.data = data
  100. self.width = None
  101. def dump(self, level=0):
  102. nl = True
  103. seqtypes = (tuple, list)
  104. for op, av in self.data:
  105. print(level*" " + str(op), end='')
  106. if op is IN:
  107. # member sublanguage
  108. print()
  109. for op, a in av:
  110. print((level+1)*" " + str(op), a)
  111. elif op is BRANCH:
  112. print()
  113. for i, a in enumerate(av[1]):
  114. if i:
  115. print(level*" " + "OR")
  116. a.dump(level+1)
  117. elif op is GROUPREF_EXISTS:
  118. condgroup, item_yes, item_no = av
  119. print('', condgroup)
  120. item_yes.dump(level+1)
  121. if item_no:
  122. print(level*" " + "ELSE")
  123. item_no.dump(level+1)
  124. elif isinstance(av, seqtypes):
  125. nl = False
  126. for a in av:
  127. if isinstance(a, SubPattern):
  128. if not nl:
  129. print()
  130. a.dump(level+1)
  131. nl = True
  132. else:
  133. if not nl:
  134. print(' ', end='')
  135. print(a, end='')
  136. nl = False
  137. if not nl:
  138. print()
  139. else:
  140. print('', av)
  141. def __repr__(self):
  142. return repr(self.data)
  143. def __len__(self):
  144. return len(self.data)
  145. def __delitem__(self, index):
  146. del self.data[index]
  147. def __getitem__(self, index):
  148. if isinstance(index, slice):
  149. return SubPattern(self.state, self.data[index])
  150. return self.data[index]
  151. def __setitem__(self, index, code):
  152. self.data[index] = code
  153. def insert(self, index, code):
  154. self.data.insert(index, code)
  155. def append(self, code):
  156. self.data.append(code)
  157. def getwidth(self):
  158. # determine the width (min, max) for this subpattern
  159. if self.width is not None:
  160. return self.width
  161. lo = hi = 0
  162. for op, av in self.data:
  163. if op is BRANCH:
  164. i = MAXREPEAT - 1
  165. j = 0
  166. for av in av[1]:
  167. l, h = av.getwidth()
  168. i = min(i, l)
  169. j = max(j, h)
  170. lo = lo + i
  171. hi = hi + j
  172. elif op is CALL:
  173. i, j = av.getwidth()
  174. lo = lo + i
  175. hi = hi + j
  176. elif op is SUBPATTERN:
  177. i, j = av[-1].getwidth()
  178. lo = lo + i
  179. hi = hi + j
  180. elif op in _REPEATCODES:
  181. i, j = av[2].getwidth()
  182. lo = lo + i * av[0]
  183. hi = hi + j * av[1]
  184. elif op in _UNITCODES:
  185. lo = lo + 1
  186. hi = hi + 1
  187. elif op is GROUPREF:
  188. i, j = self.state.groupwidths[av]
  189. lo = lo + i
  190. hi = hi + j
  191. elif op is GROUPREF_EXISTS:
  192. i, j = av[1].getwidth()
  193. if av[2] is not None:
  194. l, h = av[2].getwidth()
  195. i = min(i, l)
  196. j = max(j, h)
  197. else:
  198. i = 0
  199. lo = lo + i
  200. hi = hi + j
  201. elif op is SUCCESS:
  202. break
  203. self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
  204. return self.width
  205. class Tokenizer:
  206. def __init__(self, string):
  207. self.istext = isinstance(string, str)
  208. self.string = string
  209. if not self.istext:
  210. string = str(string, 'latin1')
  211. self.decoded_string = string
  212. self.index = 0
  213. self.next = None
  214. self.__next()
  215. def __next(self):
  216. index = self.index
  217. try:
  218. char = self.decoded_string[index]
  219. except IndexError:
  220. self.next = None
  221. return
  222. if char == "\\":
  223. index += 1
  224. try:
  225. char += self.decoded_string[index]
  226. except IndexError:
  227. raise error("bad escape (end of pattern)",
  228. self.string, len(self.string) - 1) from None
  229. self.index = index + 1
  230. self.next = char
  231. def match(self, char):
  232. if char == self.next:
  233. self.__next()
  234. return True
  235. return False
  236. def get(self):
  237. this = self.next
  238. self.__next()
  239. return this
  240. def getwhile(self, n, charset):
  241. result = ''
  242. for _ in range(n):
  243. c = self.next
  244. if c not in charset:
  245. break
  246. result += c
  247. self.__next()
  248. return result
  249. def getuntil(self, terminator, name):
  250. result = ''
  251. while True:
  252. c = self.next
  253. self.__next()
  254. if c is None:
  255. if not result:
  256. raise self.error("missing " + name)
  257. raise self.error("missing %s, unterminated name" % terminator,
  258. len(result))
  259. if c == terminator:
  260. if not result:
  261. raise self.error("missing " + name, 1)
  262. break
  263. result += c
  264. return result
  265. @property
  266. def pos(self):
  267. return self.index - len(self.next or '')
  268. def tell(self):
  269. return self.index - len(self.next or '')
  270. def seek(self, index):
  271. self.index = index
  272. self.__next()
  273. def error(self, msg, offset=0):
  274. return error(msg, self.string, self.tell() - offset)
  275. def _class_escape(source, escape):
  276. # handle escape code inside character class
  277. code = ESCAPES.get(escape)
  278. if code:
  279. return code
  280. code = CATEGORIES.get(escape)
  281. if code and code[0] is IN:
  282. return code
  283. try:
  284. c = escape[1:2]
  285. if c == "x":
  286. # hexadecimal escape (exactly two digits)
  287. escape += source.getwhile(2, HEXDIGITS)
  288. if len(escape) != 4:
  289. raise source.error("incomplete escape %s" % escape, len(escape))
  290. return LITERAL, int(escape[2:], 16)
  291. elif c == "u" and source.istext:
  292. # unicode escape (exactly four digits)
  293. escape += source.getwhile(4, HEXDIGITS)
  294. if len(escape) != 6:
  295. raise source.error("incomplete escape %s" % escape, len(escape))
  296. return LITERAL, int(escape[2:], 16)
  297. elif c == "U" and source.istext:
  298. # unicode escape (exactly eight digits)
  299. escape += source.getwhile(8, HEXDIGITS)
  300. if len(escape) != 10:
  301. raise source.error("incomplete escape %s" % escape, len(escape))
  302. c = int(escape[2:], 16)
  303. chr(c) # raise ValueError for invalid code
  304. return LITERAL, c
  305. elif c == "N" and source.istext:
  306. import unicodedata
  307. # named unicode escape e.g. \N{EM DASH}
  308. if not source.match('{'):
  309. raise source.error("missing {")
  310. charname = source.getuntil('}', 'character name')
  311. try:
  312. c = ord(unicodedata.lookup(charname))
  313. except KeyError:
  314. raise source.error("undefined character name %r" % charname,
  315. len(charname) + len(r'\N{}'))
  316. return LITERAL, c
  317. elif c in OCTDIGITS:
  318. # octal escape (up to three digits)
  319. escape += source.getwhile(2, OCTDIGITS)
  320. c = int(escape[1:], 8)
  321. if c > 0o377:
  322. raise source.error('octal escape value %s outside of '
  323. 'range 0-0o377' % escape, len(escape))
  324. return LITERAL, c
  325. elif c in DIGITS:
  326. raise ValueError
  327. if len(escape) == 2:
  328. if c in ASCIILETTERS:
  329. raise source.error('bad escape %s' % escape, len(escape))
  330. return LITERAL, ord(escape[1])
  331. except ValueError:
  332. pass
  333. raise source.error("bad escape %s" % escape, len(escape))
  334. def _escape(source, escape, state):
  335. # handle escape code in expression
  336. code = CATEGORIES.get(escape)
  337. if code:
  338. return code
  339. code = ESCAPES.get(escape)
  340. if code:
  341. return code
  342. try:
  343. c = escape[1:2]
  344. if c == "x":
  345. # hexadecimal escape
  346. escape += source.getwhile(2, HEXDIGITS)
  347. if len(escape) != 4:
  348. raise source.error("incomplete escape %s" % escape, len(escape))
  349. return LITERAL, int(escape[2:], 16)
  350. elif c == "u" and source.istext:
  351. # unicode escape (exactly four digits)
  352. escape += source.getwhile(4, HEXDIGITS)
  353. if len(escape) != 6:
  354. raise source.error("incomplete escape %s" % escape, len(escape))
  355. return LITERAL, int(escape[2:], 16)
  356. elif c == "U" and source.istext:
  357. # unicode escape (exactly eight digits)
  358. escape += source.getwhile(8, HEXDIGITS)
  359. if len(escape) != 10:
  360. raise source.error("incomplete escape %s" % escape, len(escape))
  361. c = int(escape[2:], 16)
  362. chr(c) # raise ValueError for invalid code
  363. return LITERAL, c
  364. elif c == "N" and source.istext:
  365. import unicodedata
  366. # named unicode escape e.g. \N{EM DASH}
  367. if not source.match('{'):
  368. raise source.error("missing {")
  369. charname = source.getuntil('}', 'character name')
  370. try:
  371. c = ord(unicodedata.lookup(charname))
  372. except KeyError:
  373. raise source.error("undefined character name %r" % charname,
  374. len(charname) + len(r'\N{}'))
  375. return LITERAL, c
  376. elif c == "0":
  377. # octal escape
  378. escape += source.getwhile(2, OCTDIGITS)
  379. return LITERAL, int(escape[1:], 8)
  380. elif c in DIGITS:
  381. # octal escape *or* decimal group reference (sigh)
  382. if source.next in DIGITS:
  383. escape += source.get()
  384. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  385. source.next in OCTDIGITS):
  386. # got three octal digits; this is an octal escape
  387. escape += source.get()
  388. c = int(escape[1:], 8)
  389. if c > 0o377:
  390. raise source.error('octal escape value %s outside of '
  391. 'range 0-0o377' % escape,
  392. len(escape))
  393. return LITERAL, c
  394. # not an octal escape, so this is a group reference
  395. group = int(escape[1:])
  396. if group < state.groups:
  397. if not state.checkgroup(group):
  398. raise source.error("cannot refer to an open group",
  399. len(escape))
  400. state.checklookbehindgroup(group, source)
  401. return GROUPREF, group
  402. raise source.error("invalid group reference %d" % group, len(escape) - 1)
  403. if len(escape) == 2:
  404. if c in ASCIILETTERS:
  405. raise source.error("bad escape %s" % escape, len(escape))
  406. return LITERAL, ord(escape[1])
  407. except ValueError:
  408. pass
  409. raise source.error("bad escape %s" % escape, len(escape))
  410. def _uniq(items):
  411. return list(dict.fromkeys(items))
  412. def _parse_sub(source, state, verbose, nested):
  413. # parse an alternation: a|b|c
  414. items = []
  415. itemsappend = items.append
  416. sourcematch = source.match
  417. start = source.tell()
  418. while True:
  419. itemsappend(_parse(source, state, verbose, nested + 1,
  420. not nested and not items))
  421. if not sourcematch("|"):
  422. break
  423. if len(items) == 1:
  424. return items[0]
  425. subpattern = SubPattern(state)
  426. # check if all items share a common prefix
  427. while True:
  428. prefix = None
  429. for item in items:
  430. if not item:
  431. break
  432. if prefix is None:
  433. prefix = item[0]
  434. elif item[0] != prefix:
  435. break
  436. else:
  437. # all subitems start with a common "prefix".
  438. # move it out of the branch
  439. for item in items:
  440. del item[0]
  441. subpattern.append(prefix)
  442. continue # check next one
  443. break
  444. # check if the branch can be replaced by a character set
  445. set = []
  446. for item in items:
  447. if len(item) != 1:
  448. break
  449. op, av = item[0]
  450. if op is LITERAL:
  451. set.append((op, av))
  452. elif op is IN and av[0][0] is not NEGATE:
  453. set.extend(av)
  454. else:
  455. break
  456. else:
  457. # we can store this as a character set instead of a
  458. # branch (the compiler may optimize this even more)
  459. subpattern.append((IN, _uniq(set)))
  460. return subpattern
  461. subpattern.append((BRANCH, (None, items)))
  462. return subpattern
  463. def _parse(source, state, verbose, nested, first=False):
  464. # parse a simple pattern
  465. subpattern = SubPattern(state)
  466. # precompute constants into local variables
  467. subpatternappend = subpattern.append
  468. sourceget = source.get
  469. sourcematch = source.match
  470. _len = len
  471. _ord = ord
  472. while True:
  473. this = source.next
  474. if this is None:
  475. break # end of pattern
  476. if this in "|)":
  477. break # end of subpattern
  478. sourceget()
  479. if verbose:
  480. # skip whitespace and comments
  481. if this in WHITESPACE:
  482. continue
  483. if this == "#":
  484. while True:
  485. this = sourceget()
  486. if this is None or this == "\n":
  487. break
  488. continue
  489. if this[0] == "\\":
  490. code = _escape(source, this, state)
  491. subpatternappend(code)
  492. elif this not in SPECIAL_CHARS:
  493. subpatternappend((LITERAL, _ord(this)))
  494. elif this == "[":
  495. here = source.tell() - 1
  496. # character set
  497. set = []
  498. setappend = set.append
  499. ## if sourcematch(":"):
  500. ## pass # handle character classes
  501. if source.next == '[':
  502. import warnings
  503. warnings.warn(
  504. 'Possible nested set at position %d' % source.tell(),
  505. FutureWarning, stacklevel=nested + 6
  506. )
  507. negate = sourcematch("^")
  508. # check remaining characters
  509. while True:
  510. this = sourceget()
  511. if this is None:
  512. raise source.error("unterminated character set",
  513. source.tell() - here)
  514. if this == "]" and set:
  515. break
  516. elif this[0] == "\\":
  517. code1 = _class_escape(source, this)
  518. else:
  519. if set and this in '-&~|' and source.next == this:
  520. import warnings
  521. warnings.warn(
  522. 'Possible set %s at position %d' % (
  523. 'difference' if this == '-' else
  524. 'intersection' if this == '&' else
  525. 'symmetric difference' if this == '~' else
  526. 'union',
  527. source.tell() - 1),
  528. FutureWarning, stacklevel=nested + 6
  529. )
  530. code1 = LITERAL, _ord(this)
  531. if sourcematch("-"):
  532. # potential range
  533. that = sourceget()
  534. if that is None:
  535. raise source.error("unterminated character set",
  536. source.tell() - here)
  537. if that == "]":
  538. if code1[0] is IN:
  539. code1 = code1[1][0]
  540. setappend(code1)
  541. setappend((LITERAL, _ord("-")))
  542. break
  543. if that[0] == "\\":
  544. code2 = _class_escape(source, that)
  545. else:
  546. if that == '-':
  547. import warnings
  548. warnings.warn(
  549. 'Possible set difference at position %d' % (
  550. source.tell() - 2),
  551. FutureWarning, stacklevel=nested + 6
  552. )
  553. code2 = LITERAL, _ord(that)
  554. if code1[0] != LITERAL or code2[0] != LITERAL:
  555. msg = "bad character range %s-%s" % (this, that)
  556. raise source.error(msg, len(this) + 1 + len(that))
  557. lo = code1[1]
  558. hi = code2[1]
  559. if hi < lo:
  560. msg = "bad character range %s-%s" % (this, that)
  561. raise source.error(msg, len(this) + 1 + len(that))
  562. setappend((RANGE, (lo, hi)))
  563. else:
  564. if code1[0] is IN:
  565. code1 = code1[1][0]
  566. setappend(code1)
  567. set = _uniq(set)
  568. # XXX: <fl> should move set optimization to compiler!
  569. if _len(set) == 1 and set[0][0] is LITERAL:
  570. # optimization
  571. if negate:
  572. subpatternappend((NOT_LITERAL, set[0][1]))
  573. else:
  574. subpatternappend(set[0])
  575. else:
  576. if negate:
  577. set.insert(0, (NEGATE, None))
  578. # charmap optimization can't be added here because
  579. # global flags still are not known
  580. subpatternappend((IN, set))
  581. elif this in REPEAT_CHARS:
  582. # repeat previous item
  583. here = source.tell()
  584. if this == "?":
  585. min, max = 0, 1
  586. elif this == "*":
  587. min, max = 0, MAXREPEAT
  588. elif this == "+":
  589. min, max = 1, MAXREPEAT
  590. elif this == "{":
  591. if source.next == "}":
  592. subpatternappend((LITERAL, _ord(this)))
  593. continue
  594. min, max = 0, MAXREPEAT
  595. lo = hi = ""
  596. while source.next in DIGITS:
  597. lo += sourceget()
  598. if sourcematch(","):
  599. while source.next in DIGITS:
  600. hi += sourceget()
  601. else:
  602. hi = lo
  603. if not sourcematch("}"):
  604. subpatternappend((LITERAL, _ord(this)))
  605. source.seek(here)
  606. continue
  607. if lo:
  608. min = int(lo)
  609. if min >= MAXREPEAT:
  610. raise OverflowError("the repetition number is too large")
  611. if hi:
  612. max = int(hi)
  613. if max >= MAXREPEAT:
  614. raise OverflowError("the repetition number is too large")
  615. if max < min:
  616. raise source.error("min repeat greater than max repeat",
  617. source.tell() - here)
  618. else:
  619. raise AssertionError("unsupported quantifier %r" % (char,))
  620. # figure out which item to repeat
  621. if subpattern:
  622. item = subpattern[-1:]
  623. else:
  624. item = None
  625. if not item or item[0][0] is AT:
  626. raise source.error("nothing to repeat",
  627. source.tell() - here + len(this))
  628. if item[0][0] in _REPEATCODES:
  629. raise source.error("multiple repeat",
  630. source.tell() - here + len(this))
  631. if item[0][0] is SUBPATTERN:
  632. group, add_flags, del_flags, p = item[0][1]
  633. if group is None and not add_flags and not del_flags:
  634. item = p
  635. if sourcematch("?"):
  636. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  637. else:
  638. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  639. elif this == ".":
  640. subpatternappend((ANY, None))
  641. elif this == "(":
  642. start = source.tell() - 1
  643. group = True
  644. name = None
  645. add_flags = 0
  646. del_flags = 0
  647. if sourcematch("?"):
  648. # options
  649. char = sourceget()
  650. if char is None:
  651. raise source.error("unexpected end of pattern")
  652. if char == "P":
  653. # python extensions
  654. if sourcematch("<"):
  655. # named group: skip forward to end of name
  656. name = source.getuntil(">", "group name")
  657. if not name.isidentifier():
  658. msg = "bad character in group name %r" % name
  659. raise source.error(msg, len(name) + 1)
  660. elif sourcematch("="):
  661. # named backreference
  662. name = source.getuntil(")", "group name")
  663. if not name.isidentifier():
  664. msg = "bad character in group name %r" % name
  665. raise source.error(msg, len(name) + 1)
  666. gid = state.groupdict.get(name)
  667. if gid is None:
  668. msg = "unknown group name %r" % name
  669. raise source.error(msg, len(name) + 1)
  670. if not state.checkgroup(gid):
  671. raise source.error("cannot refer to an open group",
  672. len(name) + 1)
  673. state.checklookbehindgroup(gid, source)
  674. subpatternappend((GROUPREF, gid))
  675. continue
  676. else:
  677. char = sourceget()
  678. if char is None:
  679. raise source.error("unexpected end of pattern")
  680. raise source.error("unknown extension ?P" + char,
  681. len(char) + 2)
  682. elif char == ":":
  683. # non-capturing group
  684. group = None
  685. elif char == "#":
  686. # comment
  687. while True:
  688. if source.next is None:
  689. raise source.error("missing ), unterminated comment",
  690. source.tell() - start)
  691. if sourceget() == ")":
  692. break
  693. continue
  694. elif char in "=!<":
  695. # lookahead assertions
  696. dir = 1
  697. if char == "<":
  698. char = sourceget()
  699. if char is None:
  700. raise source.error("unexpected end of pattern")
  701. if char not in "=!":
  702. raise source.error("unknown extension ?<" + char,
  703. len(char) + 2)
  704. dir = -1 # lookbehind
  705. lookbehindgroups = state.lookbehindgroups
  706. if lookbehindgroups is None:
  707. state.lookbehindgroups = state.groups
  708. p = _parse_sub(source, state, verbose, nested + 1)
  709. if dir < 0:
  710. if lookbehindgroups is None:
  711. state.lookbehindgroups = None
  712. if not sourcematch(")"):
  713. raise source.error("missing ), unterminated subpattern",
  714. source.tell() - start)
  715. if char == "=":
  716. subpatternappend((ASSERT, (dir, p)))
  717. else:
  718. subpatternappend((ASSERT_NOT, (dir, p)))
  719. continue
  720. elif char == "(":
  721. # conditional backreference group
  722. condname = source.getuntil(")", "group name")
  723. if condname.isidentifier():
  724. condgroup = state.groupdict.get(condname)
  725. if condgroup is None:
  726. msg = "unknown group name %r" % condname
  727. raise source.error(msg, len(condname) + 1)
  728. else:
  729. try:
  730. condgroup = int(condname)
  731. if condgroup < 0:
  732. raise ValueError
  733. except ValueError:
  734. msg = "bad character in group name %r" % condname
  735. raise source.error(msg, len(condname) + 1) from None
  736. if not condgroup:
  737. raise source.error("bad group number",
  738. len(condname) + 1)
  739. if condgroup >= MAXGROUPS:
  740. msg = "invalid group reference %d" % condgroup
  741. raise source.error(msg, len(condname) + 1)
  742. state.checklookbehindgroup(condgroup, source)
  743. item_yes = _parse(source, state, verbose, nested + 1)
  744. if source.match("|"):
  745. item_no = _parse(source, state, verbose, nested + 1)
  746. if source.next == "|":
  747. raise source.error("conditional backref with more than two branches")
  748. else:
  749. item_no = None
  750. if not source.match(")"):
  751. raise source.error("missing ), unterminated subpattern",
  752. source.tell() - start)
  753. subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
  754. continue
  755. elif char in FLAGS or char == "-":
  756. # flags
  757. flags = _parse_flags(source, state, char)
  758. if flags is None: # global flags
  759. if not first or subpattern:
  760. import warnings
  761. warnings.warn(
  762. 'Flags not at the start of the expression %r%s' % (
  763. source.string[:20], # truncate long regexes
  764. ' (truncated)' if len(source.string) > 20 else '',
  765. ),
  766. DeprecationWarning, stacklevel=nested + 6
  767. )
  768. if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
  769. raise Verbose
  770. continue
  771. add_flags, del_flags = flags
  772. group = None
  773. else:
  774. raise source.error("unknown extension ?" + char,
  775. len(char) + 1)
  776. # parse group contents
  777. if group is not None:
  778. try:
  779. group = state.opengroup(name)
  780. except error as err:
  781. raise source.error(err.msg, len(name) + 1) from None
  782. sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
  783. not (del_flags & SRE_FLAG_VERBOSE))
  784. p = _parse_sub(source, state, sub_verbose, nested + 1)
  785. if not source.match(")"):
  786. raise source.error("missing ), unterminated subpattern",
  787. source.tell() - start)
  788. if group is not None:
  789. state.closegroup(group, p)
  790. subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
  791. elif this == "^":
  792. subpatternappend((AT, AT_BEGINNING))
  793. elif this == "$":
  794. subpatternappend((AT, AT_END))
  795. else:
  796. raise AssertionError("unsupported special character %r" % (char,))
  797. # unpack non-capturing groups
  798. for i in range(len(subpattern))[::-1]:
  799. op, av = subpattern[i]
  800. if op is SUBPATTERN:
  801. group, add_flags, del_flags, p = av
  802. if group is None and not add_flags and not del_flags:
  803. subpattern[i: i+1] = p
  804. return subpattern
  805. def _parse_flags(source, state, char):
  806. sourceget = source.get
  807. add_flags = 0
  808. del_flags = 0
  809. if char != "-":
  810. while True:
  811. flag = FLAGS[char]
  812. if source.istext:
  813. if char == 'L':
  814. msg = "bad inline flags: cannot use 'L' flag with a str pattern"
  815. raise source.error(msg)
  816. else:
  817. if char == 'u':
  818. msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
  819. raise source.error(msg)
  820. add_flags |= flag
  821. if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
  822. msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
  823. raise source.error(msg)
  824. char = sourceget()
  825. if char is None:
  826. raise source.error("missing -, : or )")
  827. if char in ")-:":
  828. break
  829. if char not in FLAGS:
  830. msg = "unknown flag" if char.isalpha() else "missing -, : or )"
  831. raise source.error(msg, len(char))
  832. if char == ")":
  833. state.flags |= add_flags
  834. return None
  835. if add_flags & GLOBAL_FLAGS:
  836. raise source.error("bad inline flags: cannot turn on global flag", 1)
  837. if char == "-":
  838. char = sourceget()
  839. if char is None:
  840. raise source.error("missing flag")
  841. if char not in FLAGS:
  842. msg = "unknown flag" if char.isalpha() else "missing flag"
  843. raise source.error(msg, len(char))
  844. while True:
  845. flag = FLAGS[char]
  846. if flag & TYPE_FLAGS:
  847. msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
  848. raise source.error(msg)
  849. del_flags |= flag
  850. char = sourceget()
  851. if char is None:
  852. raise source.error("missing :")
  853. if char == ":":
  854. break
  855. if char not in FLAGS:
  856. msg = "unknown flag" if char.isalpha() else "missing :"
  857. raise source.error(msg, len(char))
  858. assert char == ":"
  859. if del_flags & GLOBAL_FLAGS:
  860. raise source.error("bad inline flags: cannot turn off global flag", 1)
  861. if add_flags & del_flags:
  862. raise source.error("bad inline flags: flag turned on and off", 1)
  863. return add_flags, del_flags
  864. def fix_flags(src, flags):
  865. # Check and fix flags according to the type of pattern (str or bytes)
  866. if isinstance(src, str):
  867. if flags & SRE_FLAG_LOCALE:
  868. raise ValueError("cannot use LOCALE flag with a str pattern")
  869. if not flags & SRE_FLAG_ASCII:
  870. flags |= SRE_FLAG_UNICODE
  871. elif flags & SRE_FLAG_UNICODE:
  872. raise ValueError("ASCII and UNICODE flags are incompatible")
  873. else:
  874. if flags & SRE_FLAG_UNICODE:
  875. raise ValueError("cannot use UNICODE flag with a bytes pattern")
  876. if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
  877. raise ValueError("ASCII and LOCALE flags are incompatible")
  878. return flags
  879. def parse(str, flags=0, state=None):
  880. # parse 're' pattern into list of (opcode, argument) tuples
  881. source = Tokenizer(str)
  882. if state is None:
  883. state = State()
  884. state.flags = flags
  885. state.str = str
  886. try:
  887. p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
  888. except Verbose:
  889. # the VERBOSE flag was switched on inside the pattern. to be
  890. # on the safe side, we'll parse the whole thing again...
  891. state = State()
  892. state.flags = flags | SRE_FLAG_VERBOSE
  893. state.str = str
  894. source.seek(0)
  895. p = _parse_sub(source, state, True, 0)
  896. p.state.flags = fix_flags(str, p.state.flags)
  897. if source.next is not None:
  898. assert source.next == ")"
  899. raise source.error("unbalanced parenthesis")
  900. if flags & SRE_FLAG_DEBUG:
  901. p.dump()
  902. return p
  903. def parse_template(source, state):
  904. # parse 're' replacement string into list of literals and
  905. # group references
  906. s = Tokenizer(source)
  907. sget = s.get
  908. groups = []
  909. literals = []
  910. literal = []
  911. lappend = literal.append
  912. def addgroup(index, pos):
  913. if index > state.groups:
  914. raise s.error("invalid group reference %d" % index, pos)
  915. if literal:
  916. literals.append(''.join(literal))
  917. del literal[:]
  918. groups.append((len(literals), index))
  919. literals.append(None)
  920. groupindex = state.groupindex
  921. while True:
  922. this = sget()
  923. if this is None:
  924. break # end of replacement string
  925. if this[0] == "\\":
  926. # group
  927. c = this[1]
  928. if c == "g":
  929. name = ""
  930. if not s.match("<"):
  931. raise s.error("missing <")
  932. name = s.getuntil(">", "group name")
  933. if name.isidentifier():
  934. try:
  935. index = groupindex[name]
  936. except KeyError:
  937. raise IndexError("unknown group name %r" % name)
  938. else:
  939. try:
  940. index = int(name)
  941. if index < 0:
  942. raise ValueError
  943. except ValueError:
  944. raise s.error("bad character in group name %r" % name,
  945. len(name) + 1) from None
  946. if index >= MAXGROUPS:
  947. raise s.error("invalid group reference %d" % index,
  948. len(name) + 1)
  949. addgroup(index, len(name) + 1)
  950. elif c == "0":
  951. if s.next in OCTDIGITS:
  952. this += sget()
  953. if s.next in OCTDIGITS:
  954. this += sget()
  955. lappend(chr(int(this[1:], 8) & 0xff))
  956. elif c in DIGITS:
  957. isoctal = False
  958. if s.next in DIGITS:
  959. this += sget()
  960. if (c in OCTDIGITS and this[2] in OCTDIGITS and
  961. s.next in OCTDIGITS):
  962. this += sget()
  963. isoctal = True
  964. c = int(this[1:], 8)
  965. if c > 0o377:
  966. raise s.error('octal escape value %s outside of '
  967. 'range 0-0o377' % this, len(this))
  968. lappend(chr(c))
  969. if not isoctal:
  970. addgroup(int(this[1:]), len(this) - 1)
  971. else:
  972. try:
  973. this = chr(ESCAPES[this][1])
  974. except KeyError:
  975. if c in ASCIILETTERS:
  976. raise s.error('bad escape %s' % this, len(this))
  977. lappend(this)
  978. else:
  979. lappend(this)
  980. if literal:
  981. literals.append(''.join(literal))
  982. if not isinstance(source, str):
  983. # The tokenizer implicitly decodes bytes objects as latin-1, we must
  984. # therefore re-encode the final representation.
  985. literals = [None if s is None else s.encode('latin-1') for s in literals]
  986. return groups, literals
  987. def expand_template(template, match):
  988. g = match.group
  989. empty = match.string[:0]
  990. groups, literals = template
  991. literals = literals[:]
  992. try:
  993. for index, group in groups:
  994. literals[index] = g(group) or empty
  995. except IndexError:
  996. raise error("invalid group reference %d" % index)
  997. return empty.join(literals)