Improved documentation for url_grabber.py

a3ca3f1e · Robin Stecher · f78d1a07 · a3ca3f1e
Commit a3ca3f1e authored Apr 1, 2024 by Robin Stecher
--- a/src/url_grabber.py
+++ b/src/url_grabber.py
@@ -3,12 +3,17 @@ from bs4 import BeautifulSoup


 def get_urls_from_website(url: str) -> [str]:
+    """
+    Fetch the web page at the given url and collect the href value of every <a> tag that has one
+    :param url: The url of the web page whose links should be collected
+    :return: A list with the href strings of the page's <a> tags, exactly as written in the HTML
+    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
-    page = requests.get(url, allow_redirects=True, headers=headers)
-    soup = BeautifulSoup(page.text, features="html.parser")
-    all_link_elements = soup.find_all("a", href=True)
+    page = requests.get(url, allow_redirects=True, headers=headers)     # get website
+    soup = BeautifulSoup(page.text, features="html.parser")             # parse to bs4
+    all_link_elements = soup.find_all("a", href=True)             # get all <a> tag elements
    hrefs = []
-    for a in all_link_elements:
-        hrefs.append(a['href'])
+    for a in all_link_elements:                                         # Loop through <a> elements
+        hrefs.append(a['href'])                                         # Collect url
    return hrefs