Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • dev
  • main
2 results

Target

Select target project
  • robinst/university_analysis
1 result
Select Git revision
  • dev
  • main
2 results
Show changes

Commits on Source 7

No preview for this file type
assets/cacert.pem
\ No newline at end of file
......@@ -17,6 +17,13 @@ python3 main.py
The list of the university URLs [can be
found here.](https://gitlab.fachschaften.org/robinst/university_analysis/-/blob/main/assets/university_main_pages.json?ref_type=heads)
In an older version the ssl certificates were
checked without using an api. That method contained some
bugs, so the api is used now. If you still want to
rework the code base to use the old code snippet,
you might need [the certificate .pem list](http://curl.haxx.se/ca/cacert.pem).
Please place it in the assets directory named ```cacert.pem```.
## Abstract of the research subject
......
File added
Source diff could not be displayed: it is too large. Options to address this: view the blob.
from src.detect_ssl import *
import json
from src.url_grabber import get_urls_from_website
def check_url_list_for_ssl(url_input_list_path: str):
    """
    Check every university website from the JSON list for a valid ssl certificate.

    Prints a warning line for each website whose certificate check fails.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    """
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)  # json.load reads the file object directly
    for entry in parsed['universities']:
        site = entry['website']
        if not check_ssl_with_api(site):
            print(f'Website {site} seems to have no certificate!')
def get_url_list(url_input_list_path: str) -> list[str]:
    """
    Load the university website urls from a JSON file.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    :return: Returns the website urls as a list of strings
    """
    # json.load consumes the file object directly - no read()/loads() round trip
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)
    # `[str]` was not a real type annotation (it is a one-element list literal);
    # list[str] is the standard form
    return [university['website'] for university in parsed['universities']]
def get_all_urls_from_websites(url_list: [str]):
    """
    Collect every href url found on each website in the given url list.

    :param url_list: The website urls to scrape
    :return: All collected urls, filtered down to the ones with an http(s) scheme
    """
    collected: [str] = []
    for website_url in url_list:
        collected.extend(get_urls_from_website(website_url))
    return list(filter(filter_urls, collected))
def filter_urls(current_url: str) -> bool:
    """
    Check whether a url uses an http(s) scheme.

    :param current_url: The url to check
    :return: Returns true if the url starts with 'https://' or 'http://'
    """
    # str.startswith accepts a tuple of prefixes - one call instead of an 'or' chain
    return current_url.startswith(('https://', 'http://'))
def filter_http_urls(current_url: str) -> bool:
    """
    Tell whether a url uses the plain (unencrypted) http scheme.

    :param current_url: The url to inspect
    :return: Returns true when the url begins with 'http://'
    """
    http_prefix = 'http://'
    return current_url.startswith(http_prefix)
def get_only_http_domains(url_list: [str]) -> [str]:
    """
    Keep only the urls that use the unencrypted http scheme.

    :param url_list: The urls to filter
    :return: Returns the urls starting with 'http://'
    """
    # comprehension with the prefix test inlined, instead of filter() + helper
    return [url for url in url_list if url.startswith('http://')]
def count_urls_in_list(url_list: list[str]) -> int:
    """
    Count the urls in the given list.

    :param url_list: The list of urls
    :return: Returns the number of urls in the list
    """
    # `[str]` was a list literal, not a type; list[str] is the standard annotation
    return len(url_list)
from src.url_list_processor import get_url_list
from src.modules.count_url_list import count_urls_in_list
from src.modules.ssl_check import check_url_list_for_ssl
from src.modules.get_urls_from_website import get_all_urls_from_websites, get_only_http_domains
def parse_text_to_number(text_input: str) -> int:
    """
    Parses text input of str to an integer value, if it is a number
    :param text_input: The number as str
    :return: Returns the number as integer
    :raises ValueError: If the given string is not a number
    """
    # EAFP: let int() itself decide. str.isnumeric() accepts characters such
    # as '²' that int() rejects, so the old guard could still crash with an
    # unhandled ValueError instead of the intended message.
    # ValueError is an Exception subclass, so existing `except Exception`
    # callers keep working.
    try:
        return int(text_input)
    except ValueError:
        raise ValueError('Given string is not a number!') from None
def mode_count_in_list(university_list: [str]) -> None:
    """
    Executing the module for counting the urls in the university list
    :param university_list: The list of university urls
    """
    url_count = count_urls_in_list(university_list)
    print(url_count)
def mode_check_for_ssl(university_list_path: str) -> None:
    """
    Executing the module for checking all university urls in list for ssl encryption
    :param university_list_path: The path to the university list
    """
    # NOTE(review): processing_message is a module global that is only assigned
    # inside the __main__ guard - calling this earlier would raise NameError
    print(processing_message)
    check_url_list_for_ssl(university_list_path)
def mode_get_urls_from_website(university_list: [str]) -> None:
    """
    Executing module: Fetches all urls listed on the given url website array
    :param university_list: The list where the urls should be fetched from
    """
    print(processing_message)
    collected_urls = get_all_urls_from_websites(university_list)
    print(f'All urls: {collected_urls}')
    http_only = get_only_http_domains(collected_urls)
    print(f'Only http urls: {http_only}')
def print_main_menu() -> None:
    """
    Prints the main menu of the python script and runs the selected module.

    Non-numeric or unknown selections are reported as an unknown mode instead
    of crashing the menu loop.
    """
    path_to_url_list = 'assets/university_main_pages.json'
    university_list = get_url_list(path_to_url_list)
    print('===== Welcome to the ssl check script! =====')  # plain strings - no placeholders, no f-string needed
    print('== (Quit: ctrl + c) ==')
    print('Which module do you want to run?')
    print('0: Count the urls in url list')
    print('1: Check all urls of the url list for ssl encryption certificate')
    print('2: Get all urls from the university list websites')
    chosen_mode = input('Select the mode: ')
    try:
        mode = parse_text_to_number(chosen_mode)
    except Exception:  # non-numeric input: fall through to the unknown-mode branch
        mode = -1
    # Choose correct mode
    if mode == 0:  # count items in url list
        mode_count_in_list(university_list)
    elif mode == 1:  # Check website for ssl
        mode_check_for_ssl(path_to_url_list)
    elif mode == 2:  # Get all urls from websites
        mode_get_urls_from_website(university_list)
    else:
        print('This mode does not exist!')
if __name__ == '__main__':
    # Removed the block of commented-out scratch calls that accumulated here.
    university_list = get_url_list('assets/university_main_pages.json')
    print(f'Count of universities: {count_urls_in_list(university_list)}')
    processing_message = 'Processing... This may take a while'  # global read by the mode_* handlers
    try:
        while True:
            print_main_menu()
    except KeyboardInterrupt:
        # The menu advertises ctrl+c as the quit path - exit cleanly instead
        # of dumping a traceback.
        print('\nGoodbye!')
......@@ -5,6 +5,19 @@ import requests
def check_for_ssl(url: str, check_for_correct_name=False) -> bool:
"""
Check if a website has a valid SSL certificate
:param url: The url to the website which should be checked
:param check_for_correct_name: Compares the host to the certificate host and throws
an error when it is not the same
:return: Returns true, if the certificate is valid, returns false, if there came up
problems during certificate check
"""
# Transparency note
# Some parts of this method are inspired by the following Stackoverflow answer by
# Ricardo Altamirano (16th July 2012)
# https://stackoverflow.com/questions/9099349/is-there-an-easy-way-to-check-if-a-website-has-an-ssl-certificate
hostname = decode_url_to_hostname(url) # addr info needs hostname
port: int = 443
ip_address: str = get_ip_address(hostname, port)
......@@ -13,15 +26,15 @@ def check_for_ssl(url: str, check_for_correct_name=False) -> bool:
print(f'Port 443 for SSL is not open for {url} (Hostname: {hostname})!')
return False
my_socket = socket.socket()
my_socket.connect((ip_address, port))
my_socket = socket.socket() # Create a socket for the cert check request
my_socket.connect((ip_address, port)) # Passing the host's data
try:
try: # Wrap with ssl socket to check for certificate
my_socket = ssl.wrap_socket(my_socket,
cert_reqs=ssl.CERT_REQUIRED,
ca_certs="assets/cacert.pem"
)
except Exception as e:
except Exception as e: # Could not determine a valid certificate
print(f'Could not wrap certificate for {url} (Hostname: {hostname}): {e}')
return False
......@@ -40,10 +53,21 @@ def check_for_ssl(url: str, check_for_correct_name=False) -> bool:
def get_ip_address(hostname: str, port: int) -> str:
    """
    Get the ipv4 address of a hostname by passing hostname and port
    :param hostname: The hostname of the machine
    :param port: The open port of the machine
    :return: Returns the ipv4 address as a String
    """
    # getaddrinfo yields (family, type, proto, canonname, sockaddr) tuples;
    # sockaddr[0] of the first entry is the resolved address
    address_info = socket.getaddrinfo(hostname, port)
    first_sockaddr = address_info[0][4]
    return first_sockaddr[0]
def decode_url_to_hostname(url: str) -> str:
"""
Get the hostname from an url
:param url: The url where you want to get the hostname from
:return: Returns the hostname
"""
if url.startswith('https://') or url.startswith('http://'): # remove protocol prefix
url = url.split('//')[1]
parts = url.split('.') # get the parts of the url which are separated by '.'
......@@ -56,6 +80,12 @@ def decode_url_to_hostname(url: str) -> str:
def check_if_port_is_open(ipv4: str, port: int) -> bool:
"""
Checks, if a specific port is open on host with given ip4 address
:param ipv4: The ipv4 address to the specific host
:param port: THe port which status should be checked for being open
:return: Returns true, if the port is open, either false
"""
my_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
port_state = my_socket.connect_ex((ipv4, port))
result = port_state == 0 # state 0 means open, 1 is closed
......@@ -63,14 +93,24 @@ def check_if_port_is_open(ipv4: str, port: int) -> bool:
return result
def request_ssl_api(url) -> str:
    """
    Check a website for ssl certificate by accessing a ssl check api
    :param url: The url to the website which should be checked for a valid certificate
    :return: Returns the api response text as a String
    """
    hostname = decode_url_to_hostname(url)
    # timeout keeps the script from hanging forever if the api is unresponsive
    result = requests.get(f'https://ssl-checker.io/api/v1/check/{hostname}', timeout=30)
    return result.text
def check_ssl_with_api(url) -> bool:
    """
    Checks the specific website url for a valid SSL certificate on port 443
    :param url: The url to website which should be checked for a valid SSL certificate
    :return: Returns true, if the cert is valid, either false
    """
    port: int = 443
    hostname = decode_url_to_hostname(url)  # the port probe and api both need the bare host
    if not check_if_port_is_open(get_ip_address(hostname, port), port):
        return False  # Port for https is closed, no certificate valid
    json_result = json.loads(request_ssl_api(url))  # Fetch api for certificate
    return json_result['status'] == 'ok'  # Check if certificate is valid (processing resp)
def count_urls_in_list(url_list: list[str]) -> int:
    """
    Count the urls in the given list.

    :param url_list: The list of urls
    :return: Returns the number of urls in the list
    """
    # `[str]` was a list literal, not a type; list[str] is the standard annotation
    return len(url_list)
from src.url_grabber import get_urls_from_website
def get_all_urls_from_websites(url_list: [str]):
    """
    Gather every href url from each website in the given url list.

    :param url_list: The website urls to scrape
    :return: All gathered urls, filtered down to the ones with an http(s) scheme
    """
    gathered: [str] = []
    for site in url_list:
        gathered.extend(get_urls_from_website(site))
    return list(filter(filter_urls, gathered))
def filter_urls(current_url: str) -> bool:
    """
    Check whether a url uses an http(s) scheme.

    :param current_url: The url to check
    :return: Returns true if the url starts with 'https://' or 'http://'
    """
    # str.startswith accepts a tuple of prefixes - one call instead of an 'or' chain
    return current_url.startswith(('https://', 'http://'))
def filter_http_urls(current_url: str) -> bool:
    """
    Tell whether a url uses the plain (unencrypted) http scheme.

    :param current_url: The url to inspect
    :return: Returns true when the url begins with 'http://'
    """
    http_prefix = 'http://'
    return current_url.startswith(http_prefix)
def get_only_http_domains(url_list: [str]) -> [str]:
    """
    Keep only the urls that use the unencrypted http scheme.

    :param url_list: The urls to filter
    :return: Returns the urls starting with 'http://'
    """
    # comprehension with the prefix test inlined, instead of filter() + helper
    return [url for url in url_list if url.startswith('http://')]
\ No newline at end of file
import json
from src.detect_ssl import check_ssl_with_api
def check_url_list_for_ssl(url_input_list_path: str):
    """
    Check every university website from the JSON list for a valid ssl certificate.

    Prints a warning line for each website whose certificate check fails.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    """
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)  # json.load reads the file object directly
    for entry in parsed['universities']:
        site = entry['website']
        if not check_ssl_with_api(site):
            print(f'Website {site} seems to have no certificate!')
\ No newline at end of file
......@@ -3,12 +3,17 @@ from bs4 import BeautifulSoup
def get_urls_from_website(url: str) -> [str]:
    """
    Method fetches all href urls on the specified url by scraping the website from url
    :param url: The url where the urls should be loaded from
    :return: Returns a [str] array of href urls on the given website url
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
    # timeout keeps the scraper from hanging forever on an unresponsive site
    page = requests.get(url, allow_redirects=True, headers=headers, timeout=30)  # get website
    soup = BeautifulSoup(page.text, features="html.parser")  # parse to bs4
    all_link_elements = soup.find_all("a", href=True)  # get all <a> tag elements
    # comprehension replaces the manual append loop
    return [a['href'] for a in all_link_elements]
import json
def get_url_list(url_input_list_path: str) -> list[str]:
    """
    Load the university website urls from a JSON file.

    :param url_input_list_path: Path to the JSON file holding a top-level
        'universities' array of objects with a 'website' key
    :return: Returns the website urls as a list of strings
    """
    # json.load consumes the file object directly - no read()/loads() round trip
    with open(url_input_list_path, 'rb') as url_file:
        parsed = json.load(url_file)
    # `[str]` was not a real type annotation (it is a one-element list literal);
    # list[str] is the standard form
    return [university['website'] for university in parsed['universities']]