googlesearch-1.3.0/.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

googlesearch-1.3.0/LICENSE

MIT License

Copyright (c) 2020 Nv7

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

googlesearch-1.3.0/MANIFEST.in

include requirements.txt

googlesearch-1.3.0/README.md

# googlesearch

googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google.

## Installation

To install, run the following command:

```bash
python3 -m pip install googlesearch-python
```

## Usage

To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program:

```python
from googlesearch import search
search("Google")
```

Note that `search` returns a generator, so iterate over it (or wrap it in `list()`) to actually fetch the results.

## Additional options

googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get 100 results from Google, for example, run the following program:

```python
from googlesearch import search
search("Google", num_results=100)
```

If you want only unique links in your search results, use the `unique` option, as in the following program:

```python
from googlesearch import search
search("Google", num_results=100, unique=True)
```

In addition, you can change the language Google searches in. For example, to get results in French, run the following program:

```python
from googlesearch import search
search("Google", lang="fr")
```

You can also specify the region ([Country Codes](https://developers.google.com/custom-search/docs/json_api_reference#countryCodes)) for your search results. For example, to get results specifically from the US, run the following program:

```python
from googlesearch import search
search("Google", region="us")
```

If you want to turn off the safe search function (it is on by default), you can do this:

```python
from googlesearch import search
search("Google", safe=None)
```

To extract more information, such as the description or the result URL, use an advanced search:

```python
from googlesearch import search
search("Google", advanced=True)
# Returns a generator of SearchResult objects
# Properties:
# - title
# - url
# - description
```
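For example, a minimal way to consume these advanced results (a sketch; the query and result count are just placeholders) is to iterate over the generator and read each `SearchResult`'s fields:

```python
from googlesearch import search

# Iterate over the generator of SearchResult objects and print each field
for result in search("Google", num_results=5, advanced=True):
    print(result.title)
    print(result.url)
    print(result.description)
    print()
```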
If requesting more than 100 results, googlesearch will send multiple requests to go through the pages. To increase the time between these requests, use `sleep_interval`:

```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200)
```

If you are requesting more than 10 results but want to manage the batching yourself, use `start_num` to specify the index of the first result you want to fetch:

```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200, start_num=10)
```

If you are using an HTTP rotating proxy that requires you to install its CA certificate, you can simply add `ssl_verify=False` to the `search()` call to skip SSL verification.

```python
from googlesearch import search

proxy = 'http://API:@proxy.host.com:8080/'

j = search("proxy test", num_results=100, lang="en", proxy=proxy, ssl_verify=False)
for i in j:
    print(i)
```

googlesearch-1.3.0/googlesearch/__init__.py

"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from urllib.parse import unquote  # to decode the url
from .user_agents import get_useragent


def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": get_useragent(),
            "Accept": "*/*"
        },
        params={
            "q": term,
            "num": results + 2,  # Prevents multiple requests
            "hl": lang,
            "start": start,
            "safe": safe,
            "gl": region,
        },
        proxies=proxies,
        timeout=timeout,
        verify=ssl_verify,
        cookies={
            'CONSENT': 'PENDING+987',  # Bypasses the consent page
            'SOCS': 'CAESHAgBEhIaAB',
        }
    )
    resp.raise_for_status()
    return resp


class SearchResult:
    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine"""

    # Proxy setup
    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None

    start = start_num
    fetched_results = 0  # Keep track of the total fetched results
    fetched_links = set()  # to keep track of links that are already seen previously

    while fetched_results < num_results:
        # Send request
        resp = _req(term, num_results - start, lang, start, proxies, timeout, safe, ssl_verify, region)

        # put in file - comment for debugging purpose
        # with open('google.html', 'w') as f:
        #     f.write(resp.text)

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0  # Keep track of new results in this iteration
        for result in result_block:
            # Find the link tag within the result block
            link_tag = result.find("a", href=True)
            # Find the title tag within the link tag
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            # Find the description tag within the result block
            description_tag = result.find("span", class_="FrIlee")

            # Check if all necessary tags are found
            if link_tag and title_tag and description_tag:
                # Extract and decode the link URL
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
                # Check if the link has already been fetched and if unique results are required
                if link in fetched_links and unique:
                    continue  # Skip this result if the link is not unique

                # Add the link to the set of fetched links
                fetched_links.add(link)

                # Extract the title text
                title = title_tag.text if title_tag else ""

                # Extract the description text
                description = description_tag.text if description_tag else ""

                # Increment the count of fetched results
                fetched_results += 1

                # Increment the count of new results in this iteration
                new_results += 1

                # Yield the result based on the advanced flag
                if advanced:
                    yield SearchResult(link, title, description)  # Yield a SearchResult object
                else:
                    yield link  # Yield only the link

                if fetched_results >= num_results:
                    break  # Stop if we have fetched the desired number of results

        if new_results == 0:
            # If you want a message printed when the desired number of results cannot be fulfilled, uncomment the line below:
            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
            break  # Break the loop if no new results were found in this iteration

        start += 10  # Prepare for the next set of results
        sleep(sleep_interval)

googlesearch-1.3.0/googlesearch/user_agents.py

import random


def get_useragent():
    """
    Generates a random user agent string mimicking the format of various software versions.

    The user agent string is composed of:
    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9

    Returns:
        str: A randomly generated user agent string.
    """
    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"

googlesearch-1.3.0/requirements.txt

beautifulsoup4>=4.9
requests>=2.20

googlesearch-1.3.0/setup.cfg

[metadata]
description-file=README.md
license_files=LICENSE

googlesearch-1.3.0/setup.py

from setuptools import setup

with open("README.md", "r", encoding='UTF-8') as fh:
    long_description = fh.read()

with open("requirements.txt", "r", encoding='UTF-8') as fh:
    requirements = fh.read().split("\n")

setup(
    name="googlesearch-python",
    version="1.3.0",
    author="Nishant Vikramaditya",
    author_email="junk4Nv7@gmail.com",
    description="A Python library for scraping the Google search engine.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Nv7-GitHub/googlesearch",
    packages=["googlesearch"],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
    install_requires=[requirements],
    include_package_data=True,
)
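A small closing sketch (an illustration added here, not a file from the package): `user_agents.py` is the one module the README never shows in use. The snippet below prints a few of the randomized Lynx-style User-Agent strings that `_req()` attaches to each request; it makes no network calls, and the string in the comment is just one possible output.

```python
from googlesearch.user_agents import get_useragent

# Each call returns a freshly randomized string such as
# "Lynx/2.8.1 libwww-FM/2.14 SSL-MM/1.4 OpenSSL/1.4.2"
for _ in range(3):
    print(get_useragent())
```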