Add experimental geo restriction bypass mechanism

Based on faking X-Forwarded-For HTTP header
This commit is contained in:
Sergey M․ 2017-02-04 18:49:58 +07:00 committed by Sergey M
parent bf5b9d859a
commit 773f291dcb
5 changed files with 340 additions and 6 deletions

View file

@ -56,6 +56,8 @@
ExtractorError, ExtractorError,
format_bytes, format_bytes,
formatSeconds, formatSeconds,
GeoRestrictedError,
ISO3166Utils,
locked_file, locked_file,
make_HTTPS_handler, make_HTTPS_handler,
MaxDownloadsReached, MaxDownloadsReached,
@ -272,6 +274,13 @@ class YoutubeDL(object):
If it returns None, the video is downloaded. If it returns None, the video is downloaded.
match_filter_func in utils.py is one example for this. match_filter_func in utils.py is one example for this.
no_color: Do not emit color codes in output. no_color: Do not emit color codes in output.
bypass_geo_restriction:
Bypass geographic restriction via faking X-Forwarded-For
HTTP header (experimental)
bypass_geo_restriction_as_country:
Two-letter ISO 3166-1 alpha-2 country code that will be used for
explicit geographic restriction bypassing via faking
X-Forwarded-For HTTP header (experimental)
The following options determine which downloader is picked: The following options determine which downloader is picked:
external_downloader: Executable of the external downloader to call. external_downloader: Executable of the external downloader to call.
@ -707,6 +716,14 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={},
return self.process_ie_result(ie_result, download, extra_info) return self.process_ie_result(ie_result, download, extra_info)
else: else:
return ie_result return ie_result
except GeoRestrictedError as e:
msg = e.msg
if e.countries:
msg += '\nThis video is available in %s.' % ', '.join(
map(ISO3166Utils.short2full, e.countries))
msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
self.report_error(msg)
break
except ExtractorError as e: # An error we somewhat expected except ExtractorError as e: # An error we somewhat expected
self.report_error(compat_str(e), e.format_traceback()) self.report_error(compat_str(e), e.format_traceback())
break break

View file

@ -414,6 +414,8 @@ def parse_retries(retries):
'cn_verification_proxy': opts.cn_verification_proxy, 'cn_verification_proxy': opts.cn_verification_proxy,
'geo_verification_proxy': opts.geo_verification_proxy, 'geo_verification_proxy': opts.geo_verification_proxy,
'config_location': opts.config_location, 'config_location': opts.config_location,
'bypass_geo_restriction': opts.bypass_geo_restriction,
'bypass_geo_restriction_as_country': opts.bypass_geo_restriction_as_country,
} }
with YoutubeDL(ydl_opts) as ydl: with YoutubeDL(ydl_opts) as ydl:

View file

@ -6,6 +6,7 @@
import json import json
import netrc import netrc
import os import os
import random
import re import re
import socket import socket
import sys import sys
@ -39,6 +40,8 @@
ExtractorError, ExtractorError,
fix_xml_ampersands, fix_xml_ampersands,
float_or_none, float_or_none,
GeoRestrictedError,
GeoUtils,
int_or_none, int_or_none,
js_to_json, js_to_json,
parse_iso8601, parse_iso8601,
@ -320,17 +323,25 @@ class InfoExtractor(object):
_real_extract() methods and define a _VALID_URL regexp. _real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors. Probably, they should also be added to the list of extractors.
_BYPASS_GEO attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
Note that this does not disable the explicit geo restriction bypass
based on the country code provided with bypass_geo_restriction_as_country.
Finally, the _WORKING attribute should be set to False for broken IEs Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests. in order to warn the users and skip the tests.
""" """
_ready = False _ready = False
_downloader = None _downloader = None
_x_forwarded_for_ip = None
_BYPASS_GEO = True
_WORKING = True _WORKING = True
def __init__(self, downloader=None): def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader.""" """Constructor. Receives an optional downloader."""
self._ready = False self._ready = False
self._x_forwarded_for_ip = None
self.set_downloader(downloader) self.set_downloader(downloader)
@classmethod @classmethod
@ -359,6 +370,10 @@ def working(cls):
def initialize(self): def initialize(self):
"""Initializes an instance (authentication, etc).""" """Initializes an instance (authentication, etc)."""
if not self._x_forwarded_for_ip:
country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None)
if country_code:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if not self._ready: if not self._ready:
self._real_initialize() self._real_initialize()
self._ready = True self._ready = True
@ -366,8 +381,22 @@ def initialize(self):
def extract(self, url): def extract(self, url):
"""Extracts URL information and returns it in list of dicts.""" """Extracts URL information and returns it in list of dicts."""
try: try:
self.initialize() for _ in range(2):
return self._real_extract(url) try:
self.initialize()
return self._real_extract(url)
except GeoRestrictedError as e:
if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and
self._BYPASS_GEO and
self._downloader.params.get('bypass_geo_restriction', True) and
not self._x_forwarded_for_ip and
e.countries):
self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
if self._x_forwarded_for_ip:
self.report_warning(
'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
continue
raise
except ExtractorError: except ExtractorError:
raise raise
except compat_http_client.IncompleteRead as e: except compat_http_client.IncompleteRead as e:
@ -434,6 +463,15 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if isinstance(url_or_request, (compat_str, str)): if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0] url_or_request = url_or_request.partition('#')[0]
# Some sites check the X-Forwarded-For HTTP header in order to figure out
# the origin of a client behind a proxy. This allows bypassing geo
# restriction by faking this header's value with an IP address that belongs
# to some geo unrestricted country. We will do so once we encounter any
# geo restriction error.
if self._x_forwarded_for_ip:
if 'X-Forwarded-For' not in headers:
headers['X-Forwarded-For'] = self._x_forwarded_for_ip
urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
if urlh is False: if urlh is False:
assert not fatal assert not fatal
@ -609,10 +647,8 @@ def raise_login_required(msg='This video is only available for registered users'
expected=True) expected=True)
@staticmethod @staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
raise ExtractorError( raise GeoRestrictedError(msg, countries=countries)
'%s. You might want to use --proxy to workaround.' % msg,
expected=True)
# Methods for following #608 # Methods for following #608
@staticmethod @staticmethod

View file

@ -549,6 +549,18 @@ def _scrub_eq(o):
'Upper bound of a range for randomized sleep before each download ' 'Upper bound of a range for randomized sleep before each download '
'(maximum possible number of seconds to sleep). Must only be used ' '(maximum possible number of seconds to sleep). Must only be used '
'along with --min-sleep-interval.')) 'along with --min-sleep-interval.'))
workarounds.add_option(
'--bypass-geo',
action='store_true', dest='bypass_geo_restriction', default=True,
help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)')
workarounds.add_option(
'--no-bypass-geo',
action='store_false', dest='bypass_geo_restriction', default=True,
help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)')
workarounds.add_option(
'--bypass-geo-as-country', metavar='CODE',
dest='bypass_geo_restriction_as_country', default=None,
help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option( verbosity.add_option(

View file

@ -23,6 +23,7 @@
import os import os
import pipes import pipes
import platform import platform
import random
import re import re
import socket import socket
import ssl import ssl
@ -747,6 +748,18 @@ class RegexNotFoundError(ExtractorError):
pass pass
class GeoRestrictedError(ExtractorError):
    """Error signalling that a video is geographically restricted.

    Raised when a website refuses to serve a video because of the
    geographic location the request comes from.

    The message passed in is kept unmodified in `msg`; `countries` holds
    the optional collection of country codes supplied by the raiser
    (None when unknown).
    """

    def __init__(self, msg, countries=None):
        # The base class is always told that this is an expected error.
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.countries = countries
        self.msg = msg
class DownloadError(YoutubeDLError): class DownloadError(YoutubeDLError):
"""Download Error exception. """Download Error exception.
@ -3027,6 +3040,260 @@ def short2full(cls, code):
return cls._country_map.get(code.upper()) return cls._country_map.get(code.upper())
class GeoUtils(object):
    """Helpers for picking IPv4 addresses that belong to a given country."""

    # Major IPv4 address blocks per country
    # (upper-case two-letter code -> single block in CIDR notation)
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Pick a random IPv4 address from the block listed for a country.

        `code` is a two-letter country code; it is upper-cased before the
        lookup, so matching is case-insensitive.

        Returns the address in dotted-quad notation, or None when no block
        is listed for the given code.
        """
        cidr = cls._country_ip_map.get(code.upper())
        if not cidr:
            return None
        network, prefix_len = cidr.split('/')
        # Lowest address of the block as an unsigned 32-bit integer
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit gives the highest address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        # Both ends of the range are eligible (randint is inclusive)
        picked = random.randint(lowest, highest)
        return socket.inet_ntoa(compat_struct_pack('!I', picked))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None): def __init__(self, proxies=None):
# Set default handlers # Set default handlers