From 024df5b80755c571f2f72c4e88e941f4813a8d40 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 02:29:47 +0430
Subject: [PATCH 01/21] added offset & query; changed the way URLs are built.

The new -q flag performs a normal search, just like typing into the Google Scholar search box. URL construction was reworked because the old scheme sent useless arguments in the URL, and Google was banning me because of them. The new -o flag adds an offset: the number of leading articles to skip.

--- scholar.py | 125 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 49 deletions(-)

diff --git a/scholar.py b/scholar.py index 13ccd43..0b8aee5 100755 --- a/scholar.py +++ b/scholar.py
@@ -165,6 +165,7 @@ import os import re import sys +from typing import OrderedDict import warnings try:
@@ -745,24 +746,32 @@ class SearchScholarQuery(ScholarQuery): This version represents the search query parameters the user can configure on the Scholar website, in the advanced search options. """ - SCHOLAR_QUERY_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \ + 'as_q=%(words)s' \ + '&as_epq=%(phrase)s' \ + '&as_oq=%(words_some)s' \ + '&as_eq=%(words_none)s' \ + '&as_occt=%(scope)s' \ + '&as_sauthors=%(authors)s' \ + '&as_publication=%(pub)s' \ + '&as_ylo=%(ylo)s' \ + '&as_yhi=%(yhi)s' \ + '&as_vis=%(citations)s' \ + '&btnG=&hl=en' \ + '%(num)s' \ + '&as_sdt=%(patents)s%%2C5' + BASE_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' + + URL_ARGS = OrderedDict({ + 'offset': 'start', + 'query': 'q', + 'words': 'as_q', + 'phrase': 'as_epq', + 'word_some': 'as_oq', + 'words_none': 'as_eq', + 'scope': 'as_occt', + 'authors': 'as_sauthors', + 'pub': 'as_publication', + 'ylo': 'as_ylo', + 'yhi': 'as_yhi', + 'citations': 'as_vis', + 'btnG': 'btnG', + 'lang': 'hl', + 'num_results': 'num', + 'patents': 'as_sdt' + }) def __init__(self): ScholarQuery.__init__(self) self._add_attribute_type('num_results', 'Results', 0) + self.offset = None + self.query = None self.words = None # The default search behavior self.words_some = None # At least one of those words self.words_none = None # None of these words
@@ -771,9 +780,51 @@ def __init__(self): self.author = None self.pub = None self.timeframe = [None, None] + self.btnG = '' + self.lang = 'en' self.include_patents = True self.include_citations = True + @property + def url_query(self): + args = { + 'offset': self.offset, + 'query': self.query, + 'words': self.words, + 'phrase': self.phrase, + 'word_some': self._parenthesize_phrases(self.words_some) if self.words_some else None, + 'words_none': self._parenthesize_phrases(self.words_none) if self.words_none else None, + 'scope': self.scope_title, + 'authors': self.author, + 'pub': self.pub, + 'ylo': self.timeframe[0], + 'yhi': self.timeframe[1], + 'citations': '0' if self.include_citations else '1', + 'btnG': self.btnG, + 'lang': self.lang, + 'num_results': self.num_results, + 'patents': '%s%%2C5' % ('0' if self.include_patents else '1') + } + + query = '' + + for key, val in args.items(): + if val != None: + query += '%s=%s&' % (self.URL_ARGS[key], quote(encode(val))) + + # deleting last '&' + query = query[: -1] + + return query + + def set_offset(self, offset): + """Sets the offset: the first (offset) articles of the search results are skipped.""" + self.offset = offset + + def set_query(self, query): + """Sets the plain search query, as typed into the Scholar search box.""" + self.query = query def set_words(self, words): """Sets words that *all* must be found in the result.""" self.words = words
@@ -826,43 +877,11 @@ def get_url(self): if self.words is None and self.words_some is None \ and self.words_none is None and self.phrase is None \ and self.author is None and self.pub is None \ - and self.timeframe[0] is None and self.timeframe[1] is None: + and self.timeframe[0] is None and self.timeframe[1] is None \ + and self.query is None: raise QueryArgumentError('search query needs more parameters') - # If we have some-words or none-words lists, we need to - # process them so GS understands them. For simple - # space-separeted word lists, there's nothing to do. For lists - # of phrases we have to ensure quotations around the phrases, - # separating them by whitespace. - words_some = None - words_none = None - - if self.words_some: - words_some = self._parenthesize_phrases(self.words_some) - if self.words_none: - words_none = self._parenthesize_phrases(self.words_none) - - urlargs = {'words': self.words or '', - 'words_some': words_some or '', - 'words_none': words_none or '', - 'phrase': self.phrase or '', - 'scope': 'title' if self.scope_title else 'any', - 'authors': self.author or '', - 'pub': self.pub or '', - 'ylo': self.timeframe[0] or '', - 'yhi': self.timeframe[1] or '', - 'patents': '0' if self.include_patents else '1', - 'citations': '0' if self.include_citations else '1'} - - for key, val in urlargs.items(): - urlargs[key] = quote(encode(val)) - - # The following URL arguments must not be quoted, or the - # server will not recognize them: - urlargs['num'] = ('&num=%d' % self.num_results - if self.num_results is not None else '') - - return self.SCHOLAR_QUERY_URL % urlargs + return self.BASE_URL + self.url_query class ScholarSettings(object):
@@ -1165,6 +1184,10 @@ def main(): parser = optparse.OptionParser(usage=usage, formatter=fmt) group = optparse.OptionGroup(parser, 'Query arguments', 'These options define search query arguments and parameters.') + group.add_option('-q', '--query', metavar='QUERY', default=None, + help='Normal search query.') + group.add_option('-o', '--offset', metavar='OFFSET', default=None, + help='Skip the first OFFSET articles in the search results.') group.add_option('-a', '--author', metavar='AUTHORS', default=None, help='Author name(s)') group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
@@ -1265,6 +1288,10 @@ def main(): query = ClusterScholarQuery(cluster=options.cluster_id) else: query = SearchScholarQuery() + if options.offset: + query.set_offset(options.offset) + if options.query: + query.set_query(options.query) if options.author: query.set_author(options.author) if options.allw:

From c255020a4ca93b37f5edcd8a410e354d6794ac6a Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 02:47:59 +0430
Subject: [PATCH 02/21] fixed scope_title bug and another bug in url_query.
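
The scope bug: url_query serialized the raw boolean, so Scholar received as_occt=True instead of as_occt=title. The other bug: the loop iterated the plain args dict instead of URL_ARGS, so parameters were not guaranteed to come out in the order URL_ARGS defines, and lookups went in the wrong direction. A minimal standalone sketch of the corrected behavior (hypothetical values; the real code lives in url_query in the diff below):

    from collections import OrderedDict
    from urllib.parse import quote

    URL_ARGS = OrderedDict([('query', 'q'), ('ylo', 'as_ylo')])
    args = {'ylo': 2010, 'query': 'portfolio optimization'}

    # iterate URL_ARGS, not args, so the parameter order is stable
    parts = ['%s=%s' % (name, quote(str(args[key])))
             for key, name in URL_ARGS.items() if args.get(key) is not None]
    print('&'.join(parts))  # q=portfolio%20optimization&as_ylo=2010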
--- scholar.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scholar.py b/scholar.py index 0b8aee5..c64d3bc 100755 --- a/scholar.py +++ b/scholar.py
@@ -794,7 +794,7 @@ def url_query(self): 'phrase': self.phrase, 'word_some': self._parenthesize_phrases(self.words_some) if self.words_some else None, 'words_none': self._parenthesize_phrases(self.words_none) if self.words_none else None, - 'scope': self.scope_title, + 'scope': 'title' if self.scope_title else 'any', 'authors': self.author, 'pub': self.pub, 'ylo': self.timeframe[0], 'yhi': self.timeframe[1],
@@ -808,9 +808,9 @@ def url_query(self): query = '' - for key, val in args.items(): - if val != None: - query += '%s=%s&' % (self.URL_ARGS[key], quote(encode(val))) + for key, val in self.URL_ARGS.items(): + if args[key] != None: + query += '%s=%s&' % (val, quote(encode(args[key]))) # deleting last '&' query = query[: -1]
@@ -881,6 +881,7 @@ def get_url(self): and self.query is None: raise QueryArgumentError('search query needs more parameters') + print(self.BASE_URL + self.url_query) return self.BASE_URL + self.url_query

From 83897a4226fc1d70b1b0267b97300004029f3ead Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 02:58:57 +0430
Subject: [PATCH 03/21] deleted leftover print() in code.

--- scholar.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scholar.py b/scholar.py index c64d3bc..3b352fb 100755 --- a/scholar.py +++ b/scholar.py
@@ -880,8 +880,7 @@ def get_url(self): and self.timeframe[0] is None and self.timeframe[1] is None \ and self.query is None: raise QueryArgumentError('search query needs more parameters') - - print(self.BASE_URL + self.url_query) + return self.BASE_URL + self.url_query

From 35e90bbc440e09c9ebc4ead9ec0e20356f48355e Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 14:18:00 +0430
Subject: [PATCH 04/21] added some comments

--- scholar.py | 2 ++ 1 file changed, 2 insertions(+)

diff --git a/scholar.py b/scholar.py index 3b352fb..fbe8c09 100755 --- a/scholar.py +++ b/scholar.py
@@ -787,6 +787,8 @@ def __init__(self): @property def url_query(self): + """Builds the query string that is appended to BASE_URL for the request.""" + args = { 'offset': self.offset, 'query': self.query,

From dbe884e7517717bec5353ddaf77bdb4720728590 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 14:51:33 +0430
Subject: [PATCH 05/21] added __len__, __iadd__ to ScholarQuerier, and added max-results

--- scholar.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/scholar.py b/scholar.py index fbe8c09..b47f405 100755 --- a/scholar.py +++ b/scholar.py
@@ -1127,6 +1127,12 @@ def _get_http_response(self, url, log_msg=None, err_msg=None): ScholarUtils.log('info', err_msg + ': %s' % err) return None + def __len__(self): + return len(self.articles) + + def __iadd__(self, other): + self.articles += other.articles + return self def txt(querier, with_globals): if with_globals:
@@ -1214,8 +1220,12 @@ def main(): help='Do not include citations in results') group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None, help='Do not search, just use articles in given cluster ID') - group.add_option('-c', '--count', type='int', default=None, - help='Maximum number of results') + group.add_option('-m', '--max-results', type='int', default=None, + help='Maximum number of results to fetch; if it exceeds the number of available results, all results are returned') + group.add_option('--all-results', action='store_true', default=False, + help='Fetch all available results') + # group.add_option('-c', '--count', type='int', default=None, # help='Maximum number of results per page') parser.add_option_group(group) group = optparse.OptionGroup(parser, 'Output format',
@@ -1315,12 +1325,20 @@ def main(): if options.no_citations: query.set_include_citations(False) - if options.count is not None: - options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS) - query.set_num_page_results(options.count) + if options.max_results is not None: + # if user wants less than MAX_PAGE_RESULTS articles + # set per-page results to max_results + if options.max_results < ScholarConf.MAX_PAGE_RESULTS: + query.set_num_page_results(options.max_results) + # else: + + # options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS) querier.send_query(query) + # check + + if options.csv: csv(querier) elif options.csv_header:

From fb8b5af6627d9f8c917ecff63b0bf5d162db2119 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 17:00:00 +0430
Subject: [PATCH 06/21] added support for getting more than 10 results.

--- scholar.py | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/scholar.py b/scholar.py index b47f405..9494c2f 100755 --- a/scholar.py +++ b/scholar.py
@@ -165,6 +165,7 @@ import os import re import sys +from time import sleep from typing import OrderedDict import warnings
@@ -1031,12 +1032,14 @@ def apply_settings(self, settings): ScholarUtils.log('info', 'settings applied') return True - def send_query(self, query): + def send_query(self, query, clear=True): """ This method initiates a search query (a ScholarQuery instance) with subsequent parsing of the response. """ - self.clear_articles() + if clear: + self.clear_articles() + self.query = query html = self._get_http_response(url=query.get_url(),
@@ -1222,10 +1225,13 @@ def main(): help='Do not search, just use articles in given cluster ID') group.add_option('-m', '--max-results', type='int', default=None, help='Maximum number of results to fetch; if it exceeds the number of available results, all results are returned') + group.add_option('-D', '--delay', type='float', default=2.0, + help='delay between requests, to avoid getting banned by Google for a DOS-like burst of traffic! default is 2 sec')
group.add_option('--all-results', action='store_true', default=False, help='Fetch all available results') # group.add_option('-c', '--count', type='int', default=None, # help='Maximum number of results per page') parser.add_option_group(group) group = optparse.OptionGroup(parser, 'Output format',
@@ -1296,6 +1302,7 @@ def main(): querier.apply_settings(settings) + if options.cluster_id: query = ClusterScholarQuery(cluster=options.cluster_id) else: query = SearchScholarQuery()
@@ -1327,18 +1334,44 @@ def main(): if options.max_results is not None: # if user wants less than MAX_PAGE_RESULTS articles if options.max_results < ScholarConf.MAX_PAGE_RESULTS: + # set per-page results to max_results query.set_num_page_results(options.max_results) querier.send_query(query) + # offset is number of first articles to skip + offset = options.offset if options.offset else 0 + + # all articles available after the offset + all_results_num = query['num_results'] - offset + + # set results number to get + if options.all_results: + results_num_to_get = all_results_num + elif options.max_results: + results_num_to_get = min(options.max_results, all_results_num) + else: + results_num_to_get = len(querier) + remaining_to_get = results_num_to_get - len(querier) + + # if we didn't get enough articles, get the remaining articles + while remaining_to_get > 0: + sleep(options.delay) + # set offset + query.offset = offset + len(querier) + + # if remaining articles to get is less than max results per page + if remaining_to_get < ScholarConf.MAX_PAGE_RESULTS: + # then just get remaining results + query.set_num_page_results(remaining_to_get) + + querier.send_query(query, clear=False) + + remaining_to_get = results_num_to_get - len(querier) + print(len(querier)) if options.csv: csv(querier) elif options.csv_header:

From 66a66595f44dbc5554fcad2e772811bd1adc5b69 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 17:51:34 +0430
Subject: [PATCH 07/21] deleted testing print()s, wrote better help for the query option

--- scholar.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scholar.py b/scholar.py index 9494c2f..2af6e51 100755 --- a/scholar.py +++ b/scholar.py
@@ -1196,8 +1196,8 @@ def main(): group = optparse.OptionGroup(parser, 'Query arguments', 'These options define search query arguments and parameters.') group.add_option('-q', '--query', metavar='QUERY', default=None, help='Normal search query. If your query includes double quotes ("), replace them with (\\") and wrap the whole query in single quotes (\'). Example: \'portfolio optimization in \\"stock markets\\"\'') group.add_option('-o', '--offset', type='int', metavar='OFFSET', default=None, help='Skip the first OFFSET articles in the search results.') group.add_option('-a', '--author', metavar='AUTHORS', default=None, help='Author name(s)')
@@ -1231,7 +1231,7 @@ def main(): help='Fetch all available results') # group.add_option('-c', '--count', type='int', default=None, # help='Maximum number of results per page') - + parser.add_option_group(group) group = optparse.OptionGroup(parser, 'Output format',
@@ -1356,9 +1356,12 @@ def main(): remaining_to_get = results_num_to_get - len(querier) print(sys.argv) # if we didn't get enough articles, get the remaining articles while remaining_to_get > 0: print(f'{len(querier)}/{remaining_to_get}') sleep(options.delay) # set offset query.offset = offset + len(querier) # if remaining articles to get is less than max results per page if remaining_to_get < ScholarConf.MAX_PAGE_RESULTS: # then just get remaining results query.set_num_page_results(remaining_to_get) querier.send_query(query, clear=False) remaining_to_get = results_num_to_get - len(querier) if options.csv: csv(querier) elif options.csv_header:

From 981204723f3cdb42011e9c07ec5ac4b228359acc Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 18:12:37 +0430
Subject: [PATCH 08/21] fixed getting num_results on pages other than the first

--- scholar.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/scholar.py b/scholar.py index 2af6e51..bbddf62 100755 --- a/scholar.py +++ b/scholar.py
@@ -417,7 +417,8 @@ def _parse_globals(self): # raw text is a list because the body contains etc if raw_text is not None and len(raw_text) > 0: try: - num_results = raw_text[0].split()[1] + # the first token after 'about ' is the total number of results found + num_results = raw_text[0].lower().split('about ')[1].split()[0] # num_results may now contain commas to separate # thousands, strip: num_results = num_results.replace(',', '')
@@ -1196,7 +1197,7 @@ def main(): group = optparse.OptionGroup(parser, 'Query arguments', 'These options define search query arguments and parameters.') group.add_option('-q', '--query', metavar='QUERY', default=None, help='Normal search query. If your query includes double quotes (") or single quotes (\'), replace them with (\\") and (\\\') and wrap the whole query in single quotes (\'). Example: \'portfolio\\\'s optimization in \\"stock markets\\"\'') group.add_option('-o', '--offset', type='int', metavar='OFFSET', default=None, help='Skip the first OFFSET articles in the search results.')
@@ -1356,15 +1357,13 @@ def main(): remaining_to_get = results_num_to_get - len(querier) # if we didn't get enough articles, get the remaining articles while remaining_to_get > 0: sleep(options.delay) # set offset query.offset = offset + len(querier) # if remaining articles to get is less than max results per page if remaining_to_get < ScholarConf.MAX_PAGE_RESULTS: # then just get remaining results query.set_num_page_results(remaining_to_get) querier.send_query(query, clear=False) remaining_to_get = results_num_to_get - len(querier) if options.csv: csv(querier) elif options.csv_header:

From 3e8a2a3ac5458132c85779ef04cc4031a5a959ff Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 18:42:30 +0430
Subject: [PATCH 09/21] break out of the loop if we got banned by Google

--- scholar.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/scholar.py b/scholar.py index bbddf62..d365bba 100755 --- a/scholar.py +++ b/scholar.py
@@ -1363,7 +1363,7 @@ def main(): # set offset query.offset = offset + len(querier) # if remaining articles to get is less than max results per page if remaining_to_get < ScholarConf.MAX_PAGE_RESULTS: # then just get remaining results query.set_num_page_results(remaining_to_get) querier.send_query(query, clear=False) + # if there's a problem getting articles, break out of the loop + # it can mean that there are no more articles to get. + # or that we got banned by Google! + if results_num_to_get - len(querier) == remaining_to_get: + print("WARNING: there was probably a problem getting all of the requested articles.") + print(f"got {len(querier)} articles out of {results_num_to_get} articles.") + print("this may mean we got banned by Google.") + print("or maybe some articles were unavailable.") + break + remaining_to_get = results_num_to_get - len(querier) + print(f'remaining: {remaining_to_get}') if options.csv: csv(querier)

From 2cf81d19fb2064a8d018b82c84428b46080c0d55 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Mon, 5 Sep 2022 18:43:42 +0430
Subject: [PATCH 10/21] commented out the print of remaining articles to get

--- scholar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scholar.py b/scholar.py index d365bba..9fa107d 100755 --- a/scholar.py +++ b/scholar.py
@@ -1382,7 +1382,7 @@ def main(): break remaining_to_get = results_num_to_get - len(querier) - print(f'remaining: {remaining_to_get}') + # print(f'remaining: {remaining_to_get}') if options.csv: csv(querier)

From e522d0d4a13a137d7d1aa5c48633ccb20868cfad Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Tue, 6 Sep 2022 13:51:07 +0430
Subject: [PATCH 11/21] added .gitignore

--- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2211df6 --- /dev/null +++ b/.gitignore
@@ -0,0 +1 @@ +*.txt

From 897a55245ae2005bbd5909e57e1cce74d661f2e8 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Wed, 7 Sep 2022 13:39:27 +0430
Subject: [PATCH 12/21] fixed citation problem by changing citation data from bytes to str.
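
In Python 3 the HTTP response payload is bytes, while the rest of the pipeline (and the --citation export) expects str, which broke citation handling. A minimal sketch of the guard this patch adds in get_citation_data (illustrative payload value):

    data = b'@article{perold1984large, ...}'  # hypothetical response payload
    if type(data) == bytes:                   # decode only when needed
        data = data.decode('utf-8')           # export formats (BibTeX etc.) are text
    assert isinstance(data, str)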
--- scholar.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/scholar.py b/scholar.py index 9fa107d..99b820e 100755 --- a/scholar.py +++ b/scholar.py
@@ -485,7 +485,7 @@ def _parse_links(self, span): self._strip_url_arg('num', self._path2url(tag.get('href'))) if tag.getText().startswith('Import'): - self.article['url_citation'] = self._path2url(tag.get('href')) + self.article['url_citation'] = tag.get('href') @staticmethod
@@ -1005,7 +1005,8 @@ def apply_settings(self, settings): # to Google. soup = SoupKitchen.make_soup(html) - tag = soup.find(name='form', attrs={'id': 'gs_settings_form'}) + tag = soup.find(name='form', attrs={'id': 'gs_bdy_frm'}) + if tag is None: ScholarUtils.log('info', 'parsing settings failed: no form') return False
@@ -1026,7 +1027,7 @@ def apply_settings(self, settings): html = self._get_http_response(url=self.SET_SETTINGS_URL % urlargs, log_msg='dump of settings result HTML', - err_msg='applying setttings failed') + err_msg='applying settings failed') if html is None: return False
@@ -1069,6 +1070,10 @@ def get_citation_data(self, article): if data is None: return False + # data is + if type(data) == bytes: + data = data.decode('utf-8') + article.set_citation_data(data) return True

From 34927e76b205f813c858cae27f161848e8150939 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Wed, 7 Sep 2022 19:38:56 +0430
Subject: [PATCH 13/21] fixed apply_settings method and citation bug

--- scholar.py | 92 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 19 deletions(-)

diff --git a/scholar.py b/scholar.py index 99b820e..95f900f 100755 --- a/scholar.py +++ b/scholar.py
@@ -936,16 +936,25 @@ class ScholarQuerier(object): GET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_settings?' \ + 'sciifh=1&hl=en&as_sdt=0,5' - SET_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' \ + 'q=' \ + '&scisig=%(scisig)s' \ + '&inststart=0' \ + '&as_sdt=1,5' \ + '&as_sdtp=' \ + '&num=%(num)s' \ + '&scis=%(scis)s' \ + '%(scisf)s' \ + '&hl=en&lang=all&instq=&inst=569367360547434339&save=' + # example set-settings url: + # https://scholar.google.com/scholar_setprefs?inststart=0&scisig=AAGBfm0AAAAAYxisq4fTruxOSf9qjln8EPloukoQ1EtW&xsrf=&num=10&scis=yes&scisf=4&hl=de&lang=all&instq=&boi_access=1&has_boo_access=1&has_casa_opt_in=1&save= + BASE_SETTINGS_URL = ScholarConf.SCHOLAR_SITE + '/scholar_setprefs?' + + SETTING_ARGS = OrderedDict({ + 'inststart': 'inststart', + 'scisig': 'scisig', + 'xsrf': 'xsrf', + 'num_results': 'num', + 'scis': 'scis', + 'scisf': 'scisf', + 'lang': 'hl', + 'art_lang': 'lang', + 'instq': 'instq', + 'boi_access': 'boi_access', + 'has_boo_access': 'has_boo_access', + 'has_casa_opt_in': 'has_casa_opt_in', + 'save': 'save' + }) # Older URLs: # ScholarConf.SCHOLAR_SITE + '/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on
@@ -967,6 +976,21 @@ def __init__(self): self.query = None self.cjar = MozillaCookieJar() + self.inststart = '0' + self.scisig = '' + self.xsrf = '' + self.num = None + self.scis = None + self.scisf = None + self.lang = 'en' + self.art_lang = 'all' + self.instq = '' + self.boi_access = 1 + self.has_boo_access = 1 + self.has_casa_opt_in = 1 + self.save = '' + # If we have a cookie file, load it: if ScholarConf.COOKIE_JAR_FILE and \ os.path.exists(ScholarConf.COOKIE_JAR_FILE):
@@ -980,6 +1004,37 @@ def __init__(self): self.opener = build_opener(HTTPCookieProcessor(self.cjar)) self.settings = None # Last settings object, if any + + @property + def setting_query(self): + """Builds the query string that is appended to BASE_SETTINGS_URL for the request.""" + + args = { + 'inststart': self.inststart, + 'scisig': self.scisig, + 'xsrf': self.xsrf, + 'num_results': self.num, + 'scis': self.scis, + 'scisf': self.scisf, + 'lang': self.lang, + 'art_lang': self.art_lang, + 'instq': self.instq, + 'boi_access': self.boi_access, + 'has_boo_access': self.has_boo_access, + 'has_casa_opt_in': self.has_casa_opt_in, + 'save': self.save + } + + query = '' + + for key, val in self.SETTING_ARGS.items(): + if args[key] != None: + query += '%s=%s&' % (val, quote(encode(args[key]))) + + # deleting last '&' + query = query[: -1] + + return query def apply_settings(self, settings): """
@@ -1016,18 +1071,17 @@ def apply_settings(self, settings): ScholarUtils.log('info', 'parsing settings failed: scisig') return False - urlargs = {'scisig': tag['value'], - 'num': settings.per_page_results, - 'scis': 'no', - 'scisf': ''} + self.scisig = tag['value'] + self.num = settings.per_page_results + self.scis = 'no' if settings.citform != 0: - urlargs['scis'] = 'yes' - urlargs['scisf'] = '&scisf=%d' % settings.citform + self.scis = 'yes' + self.scisf = '%d' % settings.citform html = self._get_http_response(url=self.BASE_SETTINGS_URL + self.setting_query, log_msg='dump of settings result HTML', - err_msg='applying settings failed') + err_msg='applying settings failed') if html is None: return False
@@ -1070,7 +1124,7 @@ def get_citation_data(self, article): if data is None: return False - # data is + # change to str if it's bytes if type(data) == bytes: data = data.decode('utf-8') article.set_citation_data(data) return True

From c9110fbaaf0e6bcb658da7b08fd849f22936a5e4 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Thu, 8 Sep 2022 02:20:01 +0430
Subject: [PATCH 14/21] added bibTex parser

--- scholar.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+)

diff --git a/scholar.py b/scholar.py index 95f900f..067c5db 100755 --- a/scholar.py +++ b/scholar.py
@@ -304,6 +304,7 @@ def __init__(self): # The citation data in one of the standard export formats, # e.g. BibTeX.
self.citation_data = None + self.citation_format = None def __getitem__(self, key): if key in self.attrs:
@@ -487,6 +488,42 @@ def _parse_links(self, span): if tag.getText().startswith('Import'): self.article['url_citation'] = tag.get('href') + def _parse_bib(self, bib_text): + """Parses the BibTeX citation data and extracts its fields.""" + + # check if citation data exists + if self.article.citation_data is None: + return False + + # bibTex sample: + # @article{perold1984large, + # title={Large-scale portfolio optimization}, + # author={Perold, Andre F}, + # journal={Management science}, + # volume={30}, + # number={10}, + # pages={1143--1160}, + # year={1984}, + # publisher={INFORMS} + # } + + # regexes to get any information + bib_regs = { + 'type': r'@(.*){', + 'title': r'title=\{(.*)\}', + 'journal': r'journal=\{(.*)\}', + 'volume': r'volume=\{(.*)\}', + 'number': r'number=\{(.*)\}', + 'pages': r'pages=\{(.*)\}', + 'publisher': r'publisher=\{(.*)\}' + } + + info = {} + + for key, reg in bib_regs.items(): + info[key] = re.search(reg, bib_text, re.IGNORECASE) + + return info @staticmethod def _tag_has_class(tag, klass): """

From 99d307c5d675c4ed4efcb33578ee3e810f4f1ea3 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sat, 10 Sep 2022 15:47:10 +0430
Subject: [PATCH 15/21] updated .gitignore file

--- .gitignore | 2 ++ 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore index 2211df6..6540f05 100644 --- a/.gitignore +++ b/.gitignore
@@ -1 +1,3 @@ *.txt +*.html +*.out \ No newline at end of file

From 37d9cc1eab47d9a137357c17390ce2b966cd81ea Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sat, 10 Sep 2022 15:49:07 +0430
Subject: [PATCH 16/21] updated the way of applying delay between requests

--- scholar.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/scholar.py b/scholar.py index 067c5db..8909cbd 100755 --- a/scholar.py +++ b/scholar.py
@@ -163,6 +163,7 @@ import optparse import os +from random import randrange import re import sys from time import sleep
@@ -304,7 +305,6 @@ def __init__(self): # The citation data in one of the standard export formats, # e.g. BibTeX. self.citation_data = None - self.citation_format = None def __getitem__(self, key): if key in self.attrs:
@@ -1012,6 +1012,7 @@ def __init__(self): self.articles = [] self.query = None self.cjar = MozillaCookieJar() + self.delay_range = None self.inststart = '0' self.scisig = ''
@@ -1223,11 +1223,22 @@ def _get_http_response(self, url, log_msg=None, err_msg=None): ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8')) # For Python 3 ScholarUtils.log('debug', '<<<<' + '-'*68) + # delay so we don't send too many requests and get banned + if self.delay_range is not None: + sleep(self.delay) + return html except Exception as err: ScholarUtils.log('info', err_msg + ': %s' % err) return None + def set_delay(self, min_delay, max_delay): + self.delay_range = (min_delay, max_delay) + + @property + def delay(self): + return randrange(self.delay_range) + def __len__(self): return len(self.articles)
@@ -1335,7 +1335,7 @@ def main(): group.add_option('-m', '--max-results', type='int', default=None, help='Maximum number of results to fetch; if it exceeds the number of available results, all results are returned') group.add_option('-D', '--delay', type='float', default=2.0, - help='delay between requests, to avoid getting banned by Google for a DOS-like burst of traffic! default is 2 sec') + help='maximum delay per request (the actual delay is drawn from 0 to DELAY seconds), to avoid getting banned by Google for a DOS-like burst of traffic! default is 2 sec') group.add_option('--all-results', action='store_true', default=False, help='Fetch all available results') # group.add_option('-c', '--count', type='int', default=None, # help='Maximum number of results per page')
@@ -1410,6 +1410,7 @@ def main(): return 1 querier.apply_settings(settings) + querier.set_delay(0, options.delay)
@@ -1468,7 +1468,6 @@ def main(): # if we didn't get enough articles, get the remaining articles while remaining_to_get > 0: - sleep(options.delay) # set offset query.offset = offset + len(querier)
@@ -1500,7 +1500,7 @@ def main(): citation_export(querier) else: txt(querier, with_globals=options.txt_globals) - + if options.cookie_file: querier.save_cookies()

From 0f6639357cae7b283c396bf2c31842f42809bdf0 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sat, 10 Sep 2022 16:06:38 +0430
Subject: [PATCH 17/21] fixed bug in delay method

--- scholar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scholar.py b/scholar.py index 8909cbd..5743fb7 100755 --- a/scholar.py +++ b/scholar.py
@@ -163,7 +163,7 @@ import optparse import os -from random import randrange +from random import randrange, uniform import re import sys
@@ -1237,7 +1237,7 @@ def set_delay(self, min_delay, max_delay): @property def delay(self): - return randrange(self.delay_range) + return uniform(*self.delay_range)

From a4195993c537ab64f72487b3cff7ff422419808b Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sat, 10 Sep 2022 16:27:51 +0430
Subject: [PATCH 18/21] added no-delay option.

--- scholar.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/scholar.py b/scholar.py index 5743fb7..09130e5 100755 --- a/scholar.py +++ b/scholar.py
@@ -1336,6 +1336,8 @@ def main(): help='maximum delay per request (the actual delay is drawn from 0 to DELAY seconds), to avoid getting banned by Google for a DOS-like burst of traffic! default is 2 sec') + group.add_option('--no-delay', action='store_true', default=False, + help='set delay to zero') group.add_option('--all-results', action='store_true', default=False, help='Fetch all available results') # group.add_option('-c', '--count', type='int', default=None, # help='Maximum number of results per page')
@@ -1410,7 +1412,10 @@ def main(): return 1 querier.apply_settings(settings) - querier.set_delay(0, options.delay) + + # add delay if the user wants it. + if not options.no_delay and options.delay != 0: + querier.set_delay(0, options.delay)

From b2d1e6f280196b60f2a8261959142cbd67baedb9 Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sat, 10 Sep 2022 17:11:06 +0430
Subject: [PATCH 19/21] don't apply delay to the first request.
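
Rationale: the delay exists only to space out consecutive requests, so sleeping before the very first one wastes time. A minimal standalone sketch of the flag-based approach used below (Fetcher is a stand-in name, not the real class):

    from random import uniform
    from time import sleep

    class Fetcher:
        def __init__(self, delay_range=None):
            self.delay_range = delay_range
            self.is_first_request = True      # no delay before the first request

        def fetch(self, url):
            if self.delay_range is not None and not self.is_first_request:
                sleep(uniform(*self.delay_range))
            self.is_first_request = False     # every later request gets delayed
            # ... perform the actual HTTP request here ...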
--- scholar.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/scholar.py b/scholar.py index 09130e5..292ac3c 100755 --- a/scholar.py +++ b/scholar.py
@@ -300,6 +300,12 @@ def __init__(self): 'url_versions': [None, 'Versions list', 8], 'url_citation': [None, 'Citation link', 9], 'excerpt': [None, 'Excerpt', 10], + 'type': [None, 'Paper type', 11], + 'journal': [None, 'Journal', 12], + 'publisher': [None, 'Publisher', 13], + 'pages': [None, 'Pages', 14], + 'volume': [None, 'Volume', 15], + 'issue': [None, 'Issue', 16], } # The citation data in one of the standard export formats,
@@ -507,7 +513,7 @@ def _parse_bib(self, bib_text): # publisher={INFORMS} # } - # regexes to get any information + # regexes to extract the information bib_regs = { 'type': r'@(.*){', 'title': r'title=\{(.*)\}', 'journal': r'journal=\{(.*)\}', 'volume': r'volume=\{(.*)\}', 'number': r'number=\{(.*)\}', 'pages': r'pages=\{(.*)\}', 'publisher': r'publisher=\{(.*)\}' } - info = {} for key, reg in bib_regs.items(): - info[key] = re.search(reg, bib_text, re.IGNORECASE) + self.article[key] = re.search(reg, bib_text, re.IGNORECASE) return info
@@ -1013,6 +1018,7 @@ def __init__(self): self.articles = [] self.query = None self.cjar = MozillaCookieJar() self.delay_range = None + self.is_first_request = True # don't apply delay to the first request.
@@ -1204,6 +1210,10 @@ def _get_http_response(self, url, log_msg=None, err_msg=None): """ Helper method, sends HTTP request and returns response payload. """ + # delay so we don't send too many requests and get banned + if self.delay_range is not None and not self.is_first_request: + sleep(self.delay) + if log_msg is None: log_msg = 'HTTP response data follow' if err_msg is None:
@@ -1223,9 +1233,7 @@ def _get_http_response(self, url, log_msg=None, err_msg=None): ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8')) # For Python 3 ScholarUtils.log('debug', '<<<<' + '-'*68) - # delay so we don't send too many requests and get banned - if self.delay_range is not None: - sleep(self.delay) + self.is_first_request = False # apply delay to the next request! return html except Exception as err: ScholarUtils.log('info', err_msg + ': %s' % err) return None

From 9824b6bba066dc2ca9660d21bf5665eb3536edbb Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sun, 11 Sep 2022 01:36:55 +0430
Subject: [PATCH 20/21] added full-info option. fixed bibTex parser's bug.
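
The parser pulls individual fields out of the exported BibTeX record with one regex per field, taking the first match and defaulting missing fields to None (the fix over the earlier re.search version, which stored raw match objects). A minimal standalone sketch (hypothetical record; non-greedy groups are used here because the sample sits on one line, whereas the greedy versions in the diff assume one field per line):

    import re

    bib = '@article{x, title={Large-scale portfolio optimization}, pages={1143--1160}}'
    bib_regs = {'type': r'@(.*?)\{', 'title': r'title=\{(.*?)\}',
                'pages': r'pages=\{(.*?)\}', 'journal': r'journal=\{(.*?)\}'}

    info = {}
    for key, reg in bib_regs.items():
        val = re.findall(reg, bib, re.IGNORECASE)
        info[key] = val[0] if len(val) > 0 else None
    print(info['title'])    # Large-scale portfolio optimization
    print(info['journal'])  # None (field absent from the record)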
--- scholar.py | 91 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 38 deletions(-)

diff --git a/scholar.py b/scholar.py index 292ac3c..6d55430 100755 --- a/scholar.py +++ b/scholar.py
@@ -364,6 +364,42 @@ def as_citation(self): """ return self.citation_data or '' + def parse_bib(self): + """Parses the BibTeX citation data and extracts its fields.""" + + # check if citation data exists + if self.citation_data is None: + return False + + # bibTex sample: + # @article{perold1984large, + # title={Large-scale portfolio optimization}, + # author={Perold, Andre F}, + # journal={Management science}, + # volume={30}, + # number={10}, + # pages={1143--1160}, + # year={1984}, + # publisher={INFORMS} + # } + + # regexes to extract the information + bib_regs = { + 'type': r'@(.*){', + 'title': r'title=\{(.*)\}', + 'journal': r'journal=\{(.*)\}', + 'volume': r'volume=\{(.*)\}', + 'issue': r'number=\{(.*)\}', + 'pages': r'pages=\{(.*)\}', + 'publisher': r'publisher=\{(.*)\}' + } + + for key, reg in bib_regs.items(): + val = re.findall(reg, self.citation_data, re.IGNORECASE) + self[key] = val[0] if len(val) > 0 else None + + return True class ScholarArticleParser(object): """
@@ -494,42 +530,6 @@ def _parse_links(self, span): if tag.getText().startswith('Import'): self.article['url_citation'] = tag.get('href') - def _parse_bib(self, bib_text): - """Parses the BibTeX citation data and extracts its fields.""" (the whole _parse_bib method is removed here; it moves to the article class as parse_bib above) @staticmethod def _tag_has_class(tag, klass):
@@ -1170,7 +1170,7 @@ def get_citation_data(self, article): if data is None: return False # change to str if it's bytes if type(data) == bytes: - data = data.decode('utf-8') + data = data.decode('utf-8').replace('\\', '') # strip some useless '\' characters article.set_citation_data(data) return True
@@ -1233,11 +1233,16 @@ def _get_http_response(self, url, log_msg=None, err_msg=None): ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8')) # For Python 3 ScholarUtils.log('debug', '<<<<' + '-'*68) + # check for Google's robot check! + if "Please show you're not a robot" in html.decode('utf-8'): + ScholarUtils.log('info', err_msg + ': google recognized you as a robot!') + return None self.is_first_request = False # apply delay to the next request! return html except Exception as err: ScholarUtils.log('info', err_msg + ': %s' % err) + print(err) return None
@@ -1254,6 +1259,9 @@ def __iadd__(self, other): self.articles += other.articles return self + def __getitem__(self, num): + return self.articles[num] def txt(querier, with_globals):
@@ -1365,6 +1373,8 @@ def main(): help='Like --csv, but print header with column names') group.add_option('--citation', metavar='FORMAT', default=None, help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).') + group.add_option('--full-info', action='store_true', default=False, + help='Get full information for each article: retrieves extra fields like journal, publisher, pages, ... from the BibTeX data. (This increases run-time, since the BibTeX data must be fetched.)') parser.add_option_group(group) group = optparse.OptionGroup(parser, 'Miscellaneous')
@@ -1417,7 +1427,7 @@ def main(): querier = ScholarQuerier() settings = ScholarSettings() - if options.citation == 'bt': + if options.citation == 'bt' or options.full_info: settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX) elif options.citation == 'en': settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
@@ -1505,6 +1515,11 @@ def main(): remaining_to_get = results_num_to_get - len(querier) # print(f'remaining: {remaining_to_get}') + # include bibTeX information in the results if the user wants it + if options.full_info: + for article in querier: + article.parse_bib() + if options.csv: csv(querier) elif options.csv_header:

From 60f9e4c90605563f5f998d7c07a8459a5c8ac7fd Mon Sep 17 00:00:00 2001 From: Amir Zeinali Date: Sun, 11 Sep 2022 02:37:15 +0430
Subject: [PATCH 21/21] changed delay input option. deleted some useless lines. changed robot-check log level from info to error. and fixed some minor bugs

--- scholar.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/scholar.py b/scholar.py index 6d55430..a15ae27 100755 --- a/scholar.py +++ b/scholar.py
@@ -163,7 +163,7 @@ import optparse import os -from random import randrange, uniform +from random import uniform import re import sys from time import sleep
@@ -1235,7 +1235,7 @@ def _get_http_response(self, url, log_msg=None, err_msg=None): # check for Google's robot check! if "Please show you're not a robot" in html.decode('utf-8'): - ScholarUtils.log('info', err_msg + ': google recognized you as a robot!') + ScholarUtils.log('error', err_msg + ': google recognized you as a robot!') return None self.is_first_request = False # apply delay to the next request!
@@ -1350,8 +1350,8 @@ def main(): help='Do not search, just use articles in given cluster ID') group.add_option('-m', '--max-results', type='int', default=None, help='Maximum number of results to fetch; if it exceeds the number of available results, all results are returned') - group.add_option('-D', '--delay', type='float', default=2.0, - help='maximum delay per request (the actual delay is drawn from 0 to DELAY seconds), to avoid getting banned by Google for a DOS-like burst of traffic! default is 2 sec') + group.add_option('-D', '--delay', type='string', default=(1.0, 2.0), + help='delay range between requests, passed as min,max (each request is delayed by min to max seconds), to avoid getting banned by Google for a DOS-like burst of traffic! default is 1,2 sec') group.add_option('--no-delay', action='store_true', default=False, help='set delay to zero') group.add_option('--all-results', action='store_true', default=False, help='Fetch all available results')
@@ -1432,9 +1432,9 @@ def main(): querier.apply_settings(settings) # add delay if the user wants it. - if not options.no_delay and options.delay != 0: - querier.set_delay(0, options.delay) + if not options.no_delay and options.delay != (0, 0): + options.delay = tuple(float(n) for n in options.delay.split(',')) if type(options.delay) == str else options.delay + querier.set_delay(*options.delay) if options.cluster_id: query = ClusterScholarQuery(cluster=options.cluster_id)
@@ -1502,14 +1502,8 @@ def main(): querier.send_query(query, clear=False) - # if there's a problem getting articles, break out of the loop # it can mean that there are no more articles to get. - # or that we got banned by Google! if results_num_to_get - len(querier) == remaining_to_get: - print("WARNING: there was probably a problem getting all of the requested articles.") - print(f"got {len(querier)} articles out of {results_num_to_get} articles.") - print("this may mean we got banned by Google.") - print("or maybe some articles were unavailable.") break remaining_to_get = results_num_to_get - len(querier)
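
With the whole series applied, the -D option now takes a min,max pair parsed into a float tuple, e.g. '1,2' becomes (1.0, 2.0). A typical invocation (hypothetical query; options as documented in the help strings above) would look like:

    python scholar.py -q 'portfolio optimization in \"stock markets\"' \
        -o 20 -m 50 -D 1,3 --full-info --csv

i.e. skip the first 20 hits, fetch up to 50 results with a random 1-3 second delay between page requests, enrich each article with BibTeX fields, and emit CSV.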