Skip to content
Snippets Groups Projects
Commit a3ca3f1e authored by Robin Stecher's avatar Robin Stecher
Browse files

Improved documentation for url_grabber.py

parent f78d1a07
No related branches found
No related tags found
1 merge request!1Dev
......@@ -3,12 +3,17 @@ from bs4 import BeautifulSoup
def get_urls_from_website(url: str) -> list[str]:
    """
    Fetch all href URLs found on the page at the given URL by scraping its HTML.

    :param url: The URL of the page to scrape.
    :return: A list of the href attribute values of all <a> tags on the page.
    """
    # Send a common browser User-Agent so sites that reject the default
    # python-requests client still respond with real content.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
    page = requests.get(url, allow_redirects=True, headers=headers)  # get website
    soup = BeautifulSoup(page.text, features="html.parser")  # parse to bs4
    all_link_elements = soup.find_all("a", href=True)  # only <a> tags that carry an href
    # Collect every href attribute value.
    return [a['href'] for a in all_link_elements]
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment