Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • dev
  • main
2 results

Target

Select target project
  • robinst/university_analysis
1 result
Select Git revision
  • dev
  • main
2 results
Show changes

Commits on Source 7

No preview for this file type
assets/cacert.pem
\ No newline at end of file
......@@ -17,6 +17,13 @@ python3 main.py
The list of the university URLs [can be
found here.](https://gitlab.fachschaften.org/robinst/university_analysis/-/blob/main/assets/university_main_pages.json?ref_type=heads)
In an older version the ssl certificates were
checked without using an api. That method contained some
bugs, so the api is used now. If you still want to
rework the code base to use the old code snippet,
you might need [the certificate .pem list](http://curl.haxx.se/ca/cacert.pem).
Please place it in the assets directory named ```cacert.pem```.
## Abstract of the research subject
......
File added
Source diff could not be displayed: it is too large. Options to address this: view the blob.
from src.detect_ssl import *
import json
from src.url_grabber import get_urls_from_website
def check_url_list_for_ssl(url_input_list_path: str):
    """
    Check every university website from the JSON list for a valid ssl certificate.

    Prints a warning line for each website whose certificate check fails.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    """
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)  # json.load reads the file object directly
    for entry in parsed['universities']:
        site = entry['website']
        if not check_ssl_with_api(site):
            print(f'Website {site} seems to have no certificate!')
def get_url_list(url_input_list_path: str) -> list[str]:
    """
    Load the university website urls from a JSON file.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    :return: Returns the website urls as a list of strings
    """
    # json.load consumes the file object directly - no read()/loads() round trip
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)
    # `[str]` was not a real type annotation (it is a one-element list literal);
    # list[str] is the standard form
    return [university['website'] for university in parsed['universities']]
def get_all_urls_from_websites(url_list: [str]):
    """
    Collect every href url found on each website in the given url list.

    :param url_list: The website urls to scrape
    :return: All collected urls, filtered down to the ones with an http(s) scheme
    """
    collected: [str] = []
    for website_url in url_list:
        collected.extend(get_urls_from_website(website_url))
    return list(filter(filter_urls, collected))
def filter_urls(current_url: str) -> bool:
    """
    Check whether a url uses an http(s) scheme.

    :param current_url: The url to check
    :return: Returns true if the url starts with 'https://' or 'http://'
    """
    # str.startswith accepts a tuple of prefixes - one call instead of an 'or' chain
    return current_url.startswith(('https://', 'http://'))
def filter_http_urls(current_url: str) -> bool:
    """
    Tell whether a url uses the plain (unencrypted) http scheme.

    :param current_url: The url to inspect
    :return: Returns true when the url begins with 'http://'
    """
    http_prefix = 'http://'
    return current_url.startswith(http_prefix)
def get_only_http_domains(url_list: [str]) -> [str]:
    """
    Keep only the urls that use the unencrypted http scheme.

    :param url_list: The urls to filter
    :return: Returns the urls starting with 'http://'
    """
    # comprehension with the prefix test inlined, instead of filter() + helper
    return [url for url in url_list if url.startswith('http://')]
def count_urls_in_list(url_list: list[str]) -> int:
    """
    Count the urls in the given list.

    :param url_list: The list of urls
    :return: Returns the number of urls in the list
    """
    # `[str]` was a list literal, not a type; list[str] is the standard annotation
    return len(url_list)
from src.url_list_processor import get_url_list
from src.modules.count_url_list import count_urls_in_list
from src.modules.ssl_check import check_url_list_for_ssl
from src.modules.get_urls_from_website import get_all_urls_from_websites, get_only_http_domains
def parse_text_to_number(text_input: str) -> int:
    """
    Parses text input of str to an integer value, if it is a number
    :param text_input: The number as str
    :return: Returns the number as integer
    :raises ValueError: If the given string is not a number
    """
    # EAFP: let int() itself decide. str.isnumeric() accepts characters such
    # as '²' that int() rejects, so the old guard could still crash with an
    # unhandled ValueError instead of the intended message.
    # ValueError is an Exception subclass, so existing `except Exception`
    # callers keep working.
    try:
        return int(text_input)
    except ValueError:
        raise ValueError('Given string is not a number!') from None
def mode_count_in_list(university_list: [str]) -> None:
    """
    Executing the module for counting the urls in the university list
    :param university_list: The list of university urls
    """
    url_count = count_urls_in_list(university_list)
    print(url_count)
def mode_check_for_ssl(university_list_path: str) -> None:
    """
    Executing the module for checking all university urls in list for ssl encryption
    :param university_list_path: The path to the university list
    """
    # NOTE(review): processing_message is a module global that is only assigned
    # inside the __main__ guard - calling this earlier would raise NameError
    print(processing_message)
    check_url_list_for_ssl(university_list_path)
def mode_get_urls_from_website(university_list: [str]) -> None:
    """
    Executing module: Fetches all urls listed on the given url website array
    :param university_list: The list where the urls should be fetched from
    """
    print(processing_message)
    collected_urls = get_all_urls_from_websites(university_list)
    print(f'All urls: {collected_urls}')
    http_only = get_only_http_domains(collected_urls)
    print(f'Only http urls: {http_only}')
def print_main_menu() -> None:
    """
    Prints the main menu of the python script and runs the selected module.

    Non-numeric or unknown selections are reported as an unknown mode instead
    of crashing the menu loop.
    """
    path_to_url_list = 'assets/university_main_pages.json'
    university_list = get_url_list(path_to_url_list)
    print('===== Welcome to the ssl check script! =====')  # plain strings - no placeholders, no f-string needed
    print('== (Quit: ctrl + c) ==')
    print('Which module do you want to run?')
    print('0: Count the urls in url list')
    print('1: Check all urls of the url list for ssl encryption certificate')
    print('2: Get all urls from the university list websites')
    chosen_mode = input('Select the mode: ')
    try:
        mode = parse_text_to_number(chosen_mode)
    except Exception:  # non-numeric input: fall through to the unknown-mode branch
        mode = -1
    # Choose correct mode
    if mode == 0:  # count items in url list
        mode_count_in_list(university_list)
    elif mode == 1:  # Check website for ssl
        mode_check_for_ssl(path_to_url_list)
    elif mode == 2:  # Get all urls from websites
        mode_get_urls_from_website(university_list)
    else:
        print('This mode does not exist!')
if __name__ == '__main__':
    # Removed the block of commented-out scratch calls that accumulated here.
    university_list = get_url_list('assets/university_main_pages.json')
    print(f'Count of universities: {count_urls_in_list(university_list)}')
    processing_message = 'Processing... This may take a while'  # global read by the mode_* handlers
    try:
        while True:
            print_main_menu()
    except KeyboardInterrupt:
        # The menu advertises ctrl+c as the quit path - exit cleanly instead
        # of dumping a traceback.
        print('\nGoodbye!')
......@@ -5,6 +5,19 @@ import requests
def check_for_ssl(url: str, check_for_correct_name=False) -> bool:
"""
Check if a website has a valid SSL certificate
:param url: The url to the website which should be checked
:param check_for_correct_name: Compares the host to the certificate host and throws
an error when it is not the same
:return: Returns true, if the certificate is valid, returns false, if there came up
problems during certificate check
"""
# Transparency note
# Some parts of this method are inspired by the following Stackoverflow answer by
# Ricardo Altamirano (16th July 2012)
# https://stackoverflow.com/questions/9099349/is-there-an-easy-way-to-check-if-a-website-has-an-ssl-certificate
hostname = decode_url_to_hostname(url) # addr info needs hostname
port: int = 443
ip_address: str = get_ip_address(hostname, port)
......@@ -13,15 +26,15 @@ def check_for_ssl(url: str, check_for_correct_name=False) -> bool:
print(f'Port 443 for SSL is not open for {url} (Hostname: {hostname})!')
return False
my_socket = socket.socket()
my_socket.connect((ip_address, port))
my_socket = socket.socket() # Create a socket for the cert check request
my_socket.connect((ip_address, port)) # Passing the host's data
try:
try: # Wrap with ssl socket to check for certificate
my_socket = ssl.wrap_socket(my_socket,
cert_reqs=ssl.CERT_REQUIRED,
ca_certs="assets/cacert.pem"
)
except Exception as e:
except Exception as e: # Could not determine a valid certificate
print(f'Could not wrap certificate for {url} (Hostname: {hostname}): {e}')
return False
......@@ -40,10 +53,21 @@ def check_for_ssl(url: str, check_for_correct_name=False) -> bool:
def get_ip_address(hostname: str, port: int) -> str:
    """
    Get the ipv4 address of a hostname by passing hostname and port
    :param hostname: The hostname of the machine
    :param port: The open port of the machine
    :return: Returns the ipv4 address as a String
    """
    # getaddrinfo yields (family, type, proto, canonname, sockaddr) tuples;
    # sockaddr[0] of the first entry is the resolved address
    address_info = socket.getaddrinfo(hostname, port)
    first_sockaddr = address_info[0][4]
    return first_sockaddr[0]
def decode_url_to_hostname(url: str) -> str:
"""
Get the hostname from an url
:param url: The url where you want to get the hostname from
:return: Returns the hostname
"""
if url.startswith('https://') or url.startswith('http://'): # remove protocol prefix
url = url.split('//')[1]
parts = url.split('.') # get the parts of the url which are separated by '.'
......@@ -56,6 +80,12 @@ def decode_url_to_hostname(url: str) -> str:
def check_if_port_is_open(ipv4: str, port: int) -> bool:
"""
Checks, if a specific port is open on host with given ip4 address
:param ipv4: The ipv4 address to the specific host
:param port: THe port which status should be checked for being open
:return: Returns true, if the port is open, either false
"""
my_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
port_state = my_socket.connect_ex((ipv4, port))
result = port_state == 0 # state 0 means open, 1 is closed
......@@ -63,14 +93,24 @@ def check_if_port_is_open(ipv4: str, port: int) -> bool:
return result
def request_ssl_api(url) -> str:
    """
    Check a website for ssl certificate by accessing a ssl check api
    :param url: The url to the website which should be checked for a valid certificate
    :return: Returns the api response text as a String
    """
    hostname = decode_url_to_hostname(url)
    # timeout keeps the script from hanging forever if the api is unresponsive
    result = requests.get(f'https://ssl-checker.io/api/v1/check/{hostname}', timeout=30)
    return result.text
def check_ssl_with_api(url) -> bool:
    """
    Checks the specific website url for a valid SSL certificate on port 443
    :param url: The url to website which should be checked for a valid SSL certificate
    :return: Returns true, if the cert is valid, either false
    """
    port: int = 443
    hostname = decode_url_to_hostname(url)  # the port probe and api both need the bare host
    if not check_if_port_is_open(get_ip_address(hostname, port), port):
        return False  # Port for https is closed, no certificate valid
    json_result = json.loads(request_ssl_api(url))  # Fetch api for certificate
    return json_result['status'] == 'ok'  # Check if certificate is valid (processing resp)
def count_urls_in_list(url_list: list[str]) -> int:
    """
    Count the urls in the given list.

    :param url_list: The list of urls
    :return: Returns the number of urls in the list
    """
    # `[str]` was a list literal, not a type; list[str] is the standard annotation
    return len(url_list)
from src.url_grabber import get_urls_from_website
def get_all_urls_from_websites(url_list: [str]):
    """
    Gather every href url from each website in the given url list.

    :param url_list: The website urls to scrape
    :return: All gathered urls, filtered down to the ones with an http(s) scheme
    """
    gathered: [str] = []
    for site in url_list:
        gathered.extend(get_urls_from_website(site))
    return list(filter(filter_urls, gathered))
def filter_urls(current_url: str) -> bool:
    """
    Check whether a url uses an http(s) scheme.

    :param current_url: The url to check
    :return: Returns true if the url starts with 'https://' or 'http://'
    """
    # str.startswith accepts a tuple of prefixes - one call instead of an 'or' chain
    return current_url.startswith(('https://', 'http://'))
def filter_http_urls(current_url: str) -> bool:
    """
    Tell whether a url uses the plain (unencrypted) http scheme.

    :param current_url: The url to inspect
    :return: Returns true when the url begins with 'http://'
    """
    http_prefix = 'http://'
    return current_url.startswith(http_prefix)
def get_only_http_domains(url_list: [str]) -> [str]:
    """
    Keep only the urls that use the unencrypted http scheme.

    :param url_list: The urls to filter
    :return: Returns the urls starting with 'http://'
    """
    # comprehension with the prefix test inlined, instead of filter() + helper
    return [url for url in url_list if url.startswith('http://')]
\ No newline at end of file
import json
from src.detect_ssl import check_ssl_with_api
def check_url_list_for_ssl(url_input_list_path: str):
    """
    Check every university website from the JSON list for a valid ssl certificate.

    Prints a warning line for each website whose certificate check fails.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    """
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)  # json.load reads the file object directly
    for entry in parsed['universities']:
        site = entry['website']
        if not check_ssl_with_api(site):
            print(f'Website {site} seems to have no certificate!')
\ No newline at end of file
......@@ -3,12 +3,17 @@ from bs4 import BeautifulSoup
def get_urls_from_website(url: str) -> [str]:
    """
    Method fetches all href urls on the specified url by scraping the website from url
    :param url: The url where the urls should be loaded from
    :return: Returns a [str] array of href urls on the given website url
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
    # timeout keeps the scraper from hanging forever on an unresponsive site
    page = requests.get(url, allow_redirects=True, headers=headers, timeout=30)  # get website
    soup = BeautifulSoup(page.text, features="html.parser")  # parse to bs4
    all_link_elements = soup.find_all("a", href=True)  # get all <a> tag elements
    # comprehension replaces the manual append loop
    return [a['href'] for a in all_link_elements]
import json
def get_url_list(url_input_list_path: str) -> list[str]:
    """
    Load the university website urls from a JSON file.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    :return: Returns the website urls as a list of strings
    """
    # json.load consumes the file object directly - no read()/loads() round trip
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)
    # `[str]` was not a real type annotation (it is a one-element list literal);
    # list[str] is the standard form
    return [university['website'] for university in parsed['universities']]