commit: 78fb87b2837e15124b5855734a951598dfe025fe
parent ab2d524780736249c8988313db021e83642c24d1
Author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Date: Fri, 15 Nov 2013 12:54:13 +0100
Don't accept '>' inside the content attribute in OpenGraph regexes
Diffstat:
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
@@ -316,10 +316,12 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- esc_prop = re.escape(prop)
+ content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'  # neither quote style may run past the tag's closing '>'
+ property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+ template = r'<meta[^>]+?%s[^>]+?%s'
return [
- r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop,
- r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop,
+ template % (property_re, content_re),
+ template % (content_re, property_re),
]
def _og_search_property(self, prop, html, name=None, **kargs):