yt-dlp/yt_dlp/extractor/googlesearch.py

from __future__ import unicode_literals

import html
import itertools
import re

from .common import SearchInfoExtractor


class GoogleSearchIE(SearchInfoExtractor):
    """Search extractor that scrapes Google's video search result pages.

    Queries are addressed with the ``gvsearch`` search key, for example
    ``gvsearch15:python language``.
    """

    IE_DESC = 'Google Video search'
    IE_NAME = 'video.google:search'
    _SEARCH_KEY = 'gvsearch'
    _TESTS = [{
        'url': 'gvsearch15:python language',
        'info_dict': {
            'id': 'python language',
            'title': 'python language',
        },
        'playlist_count': 15,
    }]
    # Number of results requested per page ('num'); also the stride used to
    # compute the 'start' offset of each subsequent page.
    _PAGE_SIZE = 100

    def _search_results(self, query):
        """Yield one ``url_result`` per video link found for *query*.

        Result pages are requested one after another, each starting
        ``_PAGE_SIZE`` results further in. Iteration ends after the first
        page that does not contain a "next page" element (``id="pnnext"``).
        """
        for pagenum in itertools.count():
            webpage = self._download_webpage(
                'http://www.google.com/search', f'gvsearch:{query}',
                note=f'Downloading result page {pagenum + 1}',
                query={
                    'tbm': 'vid',
                    'q': query,
                    'start': pagenum * self._PAGE_SIZE,
                    'num': self._PAGE_SIZE,
                    'hl': 'en',
                })

            for url in re.findall(r'<div[^>]* class="dXiKIc"[^>]*><a href="([^"]+)"', webpage):
                # The href is lifted straight out of the HTML markup, so
                # character references (e.g. ``&amp;`` between query
                # parameters) must be decoded to recover the real URL.
                yield self.url_result(html.unescape(url))

            # No "next" pagination link means this was the last page.
            if not re.search(r'id="pnnext"', webpage):
                return
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-05 21:29:10 -05:00			`from __future__ import unicode_literals`

Move GoogleSearchIE into its own file 2013-06-23 14:32:49 -04:00			`import itertools`
			`import re`

			`from .common import SearchInfoExtractor`


			`class GoogleSearchIE(SearchInfoExtractor):`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-05 21:29:10 -05:00			`IE_DESC = 'Google Video search'`
			`IE_NAME = 'video.google:search'`
Move GoogleSearchIE into its own file 2013-06-23 14:32:49 -04:00			`_SEARCH_KEY = 'gvsearch'`
[GoogleSearch] Fix extractor 2022-01-31 02:02:44 -05:00			`_TESTS = [{`
[googlesearch] Move test to extractor 2014-08-25 11:02:52 -04:00			`'url': 'gvsearch15:python language',`
			`'info_dict': {`
			`'id': 'python language',`
			`'title': 'python language',`
			`},`
			`'playlist_count': 15,`
[GoogleSearch] Fix extractor 2022-01-31 02:02:44 -05:00			`}]`
			`_PAGE_SIZE = 100`
Move GoogleSearchIE into its own file 2013-06-23 14:32:49 -04:00
[extractor] Simplify search extractors 2021-10-08 16:39:55 -04:00			`def _search_results(self, query):`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-05 21:29:10 -05:00			`for pagenum in itertools.count():`
			`webpage = self._download_webpage(`
[GoogleSearch] Fix extractor 2022-01-31 02:02:44 -05:00			`'http://www.google.com/search', f'gvsearch:{query}',`
			`note=f'Downloading result page {pagenum + 1}',`
[gvsearch] Modernize and fix page result request (closes #11051) 2016-10-28 12:19:59 -04:00			`query={`
			`'tbm': 'vid',`
			`'q': query,`
[GoogleSearch] Fix extractor 2022-01-31 02:02:44 -05:00			`'start': pagenum * self._PAGE_SIZE,`
			`'num': self._PAGE_SIZE,`
[gvsearch] Modernize and fix page result request (closes #11051) 2016-10-28 12:19:59 -04:00			`'hl': 'en',`
			`})`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-05 21:29:10 -05:00
[GoogleSearch] Fix extractor 2022-01-31 02:02:44 -05:00			`for url in re.findall(r'<div[^>]* class="dXiKIc"[^>]*><a href="([^"]+)"', webpage):`
			`yield self.url_result(url)`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-05 21:29:10 -05:00
[extractor] Simplify search extractors 2021-10-08 16:39:55 -04:00			`if not re.search(r'id="pnnext"', webpage):`
			`return`