[mod] improve engine startpage to reduce the frequency of CAPTCHA

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-08-16 17:59:40 +02:00
parent 9100a48541
commit 79c499d145

View file

@ -83,6 +83,7 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from collections import OrderedDict from collections import OrderedDict
import re import re
from urllib.parse import urlencode
from unicodedata import normalize, combining from unicodedata import normalize, combining
from time import time from time import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
@ -161,7 +162,7 @@ search_form_xpath = '//form[@id="search"]'
# timestamp of the last fetch of 'sc' code # timestamp of the last fetch of 'sc' code
sc_code_ts = 0 sc_code_ts = 0
sc_code = '' sc_code = ''
sc_code_cache_sec = 30 sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" """Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
@ -275,42 +276,46 @@ def _request_cat_web(query, params):
args['language'] = engine_language args['language'] = engine_language
args['lui'] = engine_language args['lui'] = engine_language
args['abp'] = '1' # args['abp'] = '1'
if params['pageno'] > 1: if params['pageno'] > 1:
args['page'] = params['pageno'] args['page'] = params['pageno']
# build cookie # build cookie
lang_homepage = 'en' lang_homepage = 'en'
cookie = OrderedDict() cookie = OrderedDict()
cookie['connect_to_server'] = 'us'
cookie['date_time'] = 'world' cookie['date_time'] = 'world'
cookie['disable_family_filter'] = safesearch_dict[params['safesearch']] cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
cookie['disable_open_in_new_window'] = '0' cookie['disable_open_in_new_window'] = '0'
cookie['enable_post_method'] = '1' # hint: POST cookie['enable_post_method'] = '0' # hint: GET
cookie['enable_proxy_safety_suggest'] = '1' cookie['enable_proxy_safety_suggest'] = '1'
cookie['enable_stay_control'] = '1' cookie['enable_stay_control'] = '1'
cookie['instant_answers'] = '1' cookie['instant_answers'] = '1'
cookie['lang_homepage'] = 's/device/%s/' % lang_homepage cookie['lang_homepage'] = 's/device/%s' % lang_homepage
cookie['num_of_results'] = '10'
cookie['suggestions'] = '1'
cookie['wt_unit'] = 'celsius'
if engine_language: if engine_language:
cookie['language'] = engine_language cookie['language'] = engine_language
cookie['language_ui'] = engine_language cookie['language_ui'] = engine_language
cookie['num_of_results'] = '10'
if engine_region: if engine_region:
cookie['search_results_region'] = engine_region cookie['search_results_region'] = engine_region
cookie['suggestions'] = '1'
cookie['wt_unit'] = 'celsius'
params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()]) params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
logger.debug('cookie preferences: %s', params['cookies']['preferences']) logger.debug('cookie preferences: %s', params['cookies']['preferences'])
# GET request
params['method'] = 'GET'
# https://www.startpage.com/do/search?sc=CmEL6wNu8t5j20&query=foo&cat=web&qloc=eyJsYXQiOiBudWxsLCAibG5nIjogbnVsbCwgInR5cGUiOiAibm9uZSJ9
params['url'] = search_url + '?' + urlencode(args)
# POST request # POST request
logger.debug("data: %s", args) # logger.debug("data: %s", args)
params['data'] = args # params['data'] = args
params['method'] = 'POST' # params['method'] = 'POST'
params['url'] = search_url # params['url'] = search_url
params['headers']['Origin'] = base_url # params['headers']['Origin'] = base_url
params['headers']['Referer'] = base_url + '/' # params['headers']['Referer'] = base_url + '/'
# is the Accept header needed? # is the Accept header needed?
# params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'