commit: d9d07a95815a992bf5f876a62f25c831eb3f32ac
parent 825a40744bf9aeb743452db24e43d3eb61feb6c2
Author: dirkf <fieldhouse@gmx.net>
Date: Wed, 3 May 2023 12:06:34 +0100
[utils] Improve js_to_json, align with yt-dlp
* support variable substitution, from https://github.com/yt-dlp/yt-dlp/pull/521 etc,
thanks ChillingPepper, Grub4K, pukkandan
* improve escape handling, from https://github.com/yt-dlp/yt-dlp/pull/521
thanks Grub4K
* support template strings from https://github.com/yt-dlp/yt-dlp/pull/6623
thanks Grub4K
* add limited `!` evaluation (e.g., !!0 -> false, see tests)
Diffstat:
M | test/test_utils.py | 103 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- |
M | youtube_dl/utils.py | 114 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------ |
2 files changed, 187 insertions(+), 30 deletions(-)
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -905,6 +905,85 @@ class TestUtil(unittest.TestCase):
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
+ def test_js_to_json_vars_strings(self):
+ self.assertDictEqual(
+ json.loads(js_to_json(
+ '''{
+ 'null': a,
+ 'nullStr': b,
+ 'true': c,
+ 'trueStr': d,
+ 'false': e,
+ 'falseStr': f,
+ 'unresolvedVar': g,
+ }''',
+ {
+ 'a': 'null',
+ 'b': '"null"',
+ 'c': 'true',
+ 'd': '"true"',
+ 'e': 'false',
+ 'f': '"false"',
+ 'g': 'var',
+ }
+ )),
+ {
+ 'null': None,
+ 'nullStr': 'null',
+ 'true': True,
+ 'trueStr': 'true',
+ 'false': False,
+ 'falseStr': 'false',
+ 'unresolvedVar': 'var'
+ }
+ )
+
+ self.assertDictEqual(
+ json.loads(js_to_json(
+ '''{
+ 'int': a,
+ 'intStr': b,
+ 'float': c,
+ 'floatStr': d,
+ }''',
+ {
+ 'a': '123',
+ 'b': '"123"',
+ 'c': '1.23',
+ 'd': '"1.23"',
+ }
+ )),
+ {
+ 'int': 123,
+ 'intStr': '123',
+ 'float': 1.23,
+ 'floatStr': '1.23',
+ }
+ )
+
+ self.assertDictEqual(
+ json.loads(js_to_json(
+ '''{
+ 'object': a,
+ 'objectStr': b,
+ 'array': c,
+ 'arrayStr': d,
+ }''',
+ {
+ 'a': '{}',
+ 'b': '"{}"',
+ 'c': '[]',
+ 'd': '"[]"',
+ }
+ )),
+ {
+ 'object': {},
+ 'objectStr': '{}',
+ 'array': [],
+ 'arrayStr': '[]',
+ }
+ )
+
def test_js_to_json_realworld(self):
inp = '''{
'clip':{'provider':'pseudo'}
@@ -975,10 +1054,10 @@ class TestUtil(unittest.TestCase):
!42: 42
}''')
self.assertEqual(json.loads(on), {
- 'a': 0,
- 'b': 1,
- 'c': 0,
- 'd': 42.42,
+ 'a': True,
+ 'b': False,
+ 'c': False,
+ 'd': True,
'e': [],
'f': "abc",
'g': "",
@@ -1048,10 +1127,26 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})
+ on = js_to_json('[1,//{},\n2]')
+ self.assertEqual(json.loads(on), [1, 2])
+
+ on = js_to_json(r'"\^\$\#"')
+ self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped')
+
+ on = js_to_json('\'"\\""\'')
+ self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
+
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
+ def test_js_to_json_template_literal(self):
+ self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
+ self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
+ self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
+ self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
+ self.assertEqual(js_to_json('`${name}`', {}), '"name"')
+
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -4365,46 +4365,108 @@ def strip_jsonp(code):
r'\g<callback_data>', code)
-def js_to_json(code):
- COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
+def js_to_json(code, *args, **kwargs):
+
+ # vars is a dict of (var, val) pairs to substitute
+ vars = args[0] if len(args) > 0 else kwargs.get('vars', {})
+ strict = kwargs.get('strict', False)
+
+ STRING_QUOTES = '\'"`'
+ STRING_RE = '|'.join(r'{0}(?:\\.|[^\\{0}])*{0}'.format(q) for q in STRING_QUOTES)
+ COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
INTEGER_TABLE = (
(r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
(r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
+ (r'(?s)^(\d+){skip}:?$'.format(skip=SKIP_RE), 10),
)
+ # compat candidate
+ JSONDecodeError = json.JSONDecodeError if 'JSONDecodeError' in dir(json) else ValueError
+
+ def process_escape(match):
+ JSON_PASSTHROUGH_ESCAPES = r'"\bfnrtu'
+ escape = match.group(1) or match.group(2)
+
+ return ('\\' + escape if escape in JSON_PASSTHROUGH_ESCAPES
+ else '\\u00' if escape == 'x'
+ else '' if escape == '\n'
+ else escape)
+
+ def template_substitute(match):
+ evaluated = js_to_json(match.group(1), vars, strict=strict)
+ if evaluated[0] == '"':
+ return json.loads(evaluated)
+ return evaluated
def fix_kv(m):
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
- elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
- return ""
-
- if v[0] in ("'", '"'):
- v = re.sub(r'(?s)\\.|"', lambda m: {
- '"': '\\"',
- "\\'": "'",
- '\\\n': '',
- '\\x': '\\u00',
- }.get(m.group(0), m.group(0)), v[1:-1])
- else:
- for regex, base in INTEGER_TABLE:
- im = re.match(regex, v)
- if im:
- i = int(im.group(1), base)
- return '"%d":' % i if v.endswith(':') else '%d' % i
+ elif v in ('undefined', 'void 0'):
+ return 'null'
+ elif v.startswith('/*') or v.startswith('//') or v == ',':
+ return ''
+
+ if v[0] in STRING_QUOTES:
+ v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
+ escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
+ return '"{0}"'.format(escaped)
+
+ inv = IDENTITY
+ im = re.split(r'^!+', v)
+ if len(im) > 1 and not im[-1].endswith(':'):
+ if (len(v) - len(im[1])) % 2 == 1:
+ inv = lambda x: 'true' if x == 0 else 'false'
+ else:
+ inv = lambda x: 'false' if x == 0 else 'true'
+ if not any(x for x in im):
+ return
+ v = im[-1]
+
+ for regex, base in INTEGER_TABLE:
+ im = re.match(regex, v)
+ if im:
+ i = int(im.group(1), base)
+ return ('"%s":' if v.endswith(':') else '%s') % inv(i)
+
+ if v in vars:
+ try:
+ if not strict:
+ json.loads(vars[v])
+ except JSONDecodeError:
+ return inv(json.dumps(vars[v]))
+ else:
+ return inv(vars[v])
+
+ if not strict:
+ v = try_call(inv, args=(v,), default=v)
+ if v in ('true', 'false'):
+ return v
+ return '"{0}"'.format(v)
+
+ raise ValueError('Unknown value: ' + v)
+
+ def create_map(mobj):
+ return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
- return '"%s"' % v
+ code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
+ if not strict:
+ code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+ code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
+ code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
+ code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
return re.sub(r'''(?sx)
- "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
- '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
- {comment}|,(?={skip}[\]}}])|
- (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
- \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
- [0-9]+(?={skip}:)|
+ {str_}|
+ {comment}|
+ ,(?={skip}[\]}}])|
+ void\s0|
+ !*(?:(?<!\d)[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
+ (?:\b|!+)0(?:[xX][\da-fA-F]+|[0-7]+)(?:{skip}:)?|
+ !+\d+(?:\.\d*)?(?:{skip}:)?|
+ [0-9]+(?:{skip}:)|
!+
- '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+ '''.format(comment=COMMENT_RE, skip=SKIP_RE, str_=STRING_RE), fix_kv, code)
def qualities(quality_ids):