Merge pull request #1444 from Venca24/devel_google_videos

[fix] google videos engine
author: Noémi Ványi <kvch@users.noreply.github.com> 2019-01-05 18:08:05 +0100
committer: GitHub <noreply@github.com> 2019-01-05 18:08:05 +0100
commit: abcbcec0b5ab9a3108ccc972876fdb60c7911e7a (patch)
tree: 65997b07ea9b2292662f9d5453c6fd388d10845a
parent: 899ba5d6dee82faacb572b4d9bc4c58570628531 (diff)
parent: 2456b8f57199b0479b063fa3dfb16a585c6a40ed (diff)
download: searxng-abcbcec0b5ab9a3108ccc972876fdb60c7911e7a.tar.gz searxng-abcbcec0b5ab9a3108ccc972876fdb60c7911e7a.zip
2 files changed, 41 insertions, 10 deletions
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 310b31490..9a41b2dfa 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -7,7 +7,7 @@
  @using-api   no
  @results     HTML
  @stable      no
- @parse       url, title, content
+ @parse       url, title, content, thumbnail
 """
 
 from datetime import date, timedelta
@@ -15,7 +15,7 @@ from json import loads
 from lxml import html
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
-
+import re
 
 # engine dependent config
 categories = ['videos']
@@ -25,7 +25,7 @@ time_range_support = True
 number_of_results = 10
 
 search_url = 'https://www.google.com/search'\
-    '?{query}'\
+    '?q={query}'\
     '&tbm=vid'\
     '&{search_options}'
 time_range_attr = "qdr:{range}"
@@ -69,15 +69,27 @@ def response(resp):
     # parse results
     for result in dom.xpath('//div[@class="g"]'):
 
-        title = extract_text(result.xpath('.//h3/a'))
-        url = result.xpath('.//h3/a/@href')[0]
+        title = extract_text(result.xpath('.//h3'))
+        url = result.xpath('.//div[@class="r"]/a/@href')[0]
         content = extract_text(result.xpath('.//span[@class="st"]'))
 
+        # get thumbnails
+        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
+        id = result.xpath('.//div[@class="s"]//img/@id')[0]
+        thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id,
+                                     script)
+        tmp = []
+        if len(thumbnails_data) != 0:
+            tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
+        thumbnail = ''
+        if len(tmp) != 0:
+            thumbnail = tmp[-1]
+
         # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
-                        'thumbnail': '',
+                        'thumbnail': thumbnail,
                         'template': 'videos.html'})
 
     return results
diff --git a/tests/unit/engines/test_google_videos.py b/tests/unit/engines/test_google_videos.py
index a48e9a755..3b7edf373 100644
--- a/tests/unit/engines/test_google_videos.py
+++ b/tests/unit/engines/test_google_videos.py
@@ -30,16 +30,34 @@ class TestGoogleVideosEngine(SearxTestCase):
         <div>
             <div>
                 <div class="g">
-                    <div>
-                        <h3><a href="url_1">Title 1</h3>
+                    <div class="r">
+                        <a href="url_1"><h3>Title 1</h3></a>
+                    </div>
+                    <div class="s">
+                        <div>
+                            <a>
+                                <g-img>
+                                    <img id="vidthumb1">
+                                </g-img>
+                            </a>
+                        </div>
                     </div>
                     <div>
                         <span class="st">Content 1</span>
                     </div>
                 </div>
                 <div class="g">
-                    <div>
-                        <h3><a href="url_2">Title 2</h3>
+                    <div class="r">
+                        <a href="url_2"><h3>Title 2</h3></a>
+                    </div>
+                    <div class="s">
+                        <div>
+                            <a>
+                                <g-img>
+                                    <img id="vidthumb2">
+                                </g-img>
+                            </a>
+                        </div>
                     </div>
                     <div>
                         <span class="st">Content 2</span>
@@ -47,6 +65,7 @@ class TestGoogleVideosEngine(SearxTestCase):
                 </div>
             </div>
         </div>
+        <script>function _setImagesSrc(c,d,e){}</script>
         """
         response = mock.Mock(text=html)
         results = google_videos.response(response)
author	Noémi Ványi <kvch@users.noreply.github.com>	2019-01-05 18:08:05 +0100
committer	GitHub <noreply@github.com>	2019-01-05 18:08:05 +0100
commit	abcbcec0b5ab9a3108ccc972876fdb60c7911e7a (patch)
tree	65997b07ea9b2292662f9d5453c6fd388d10845a
parent	899ba5d6dee82faacb572b4d9bc4c58570628531 (diff)
parent	2456b8f57199b0479b063fa3dfb16a585c6a40ed (diff)
download	searxng-abcbcec0b5ab9a3108ccc972876fdb60c7911e7a.tar.gz searxng-abcbcec0b5ab9a3108ccc972876fdb60c7911e7a.zip