Skip to content
Snippets Groups Projects
Commit ec7d6150 authored by Robin Ebbinghaus's avatar Robin Ebbinghaus Committed by Jonas Röger
Browse files

feat(TextBlockCleaner): :sparkles: add TextBlockCleaner

parent ae308487
No related branches found
No related tags found
No related merge requests found
Pipeline #256029 passed
......@@ -6,14 +6,31 @@ from contextlib import redirect_stdout
from database_server.modules.cleaners.basic_cleaner import BasicCleaner
from database_server.modules.cleaners.test_cases import file_location, test_cases
from database_server.modules.cleaners.text_block_cleaner import TextBlockCleaner
from database_server.modules.filereader.pdf_reader_enhanced import PDFReader
class CleanerTester:
def __init__(self) -> None:
    """Prepare the tester by ensuring all test PDFs are available locally.

    Delegates to :meth:`ensure_testcases_downloaded`, which downloads any
    test-case file that is not already present on disk.
    """
    # ensure_testcases_downloaded is a @staticmethod on this class; the
    # previous unqualified call would have raised NameError at runtime.
    self.ensure_testcases_downloaded()
def test_cleaner(self, cleaner, cleaner_args):
    """Run ``cleaner`` over every registered test case and verify its output.

    For each entry in ``test_cases`` the referenced PDF is read, the cleaner
    is applied, and the cleaner's verbose stdout log is captured and handed
    to :meth:`check_log` for validation.

    Args:
        cleaner: Cleaner instance exposing ``clean_document(doc, args)``.
        cleaner_args: Dict of options forwarded to ``clean_document``
            (e.g. ``{"verbose": True}``).
    """
    reader = PDFReader()
    for test_case in test_cases:
        # Read the test file from the shared input directory.
        path = "database-server/input/" + test_case["file_name"]
        doc = reader.read_file(path, {"computer_vision": False})
        # Capture the cleaner's stdout while the buffer is still open;
        # getvalue() after the StringIO is closed would raise ValueError.
        with io.StringIO() as buf, redirect_stdout(buf):
            cleaner.clean_document(doc, cleaner_args)
            log = buf.getvalue()
        # check_log is a @staticmethod on this class; the previous
        # unqualified call would have raised NameError at runtime.
        self.check_log(test_case, log)
@staticmethod
def ensure_testcases_downloaded():
# download files if not already present
for test_case in test_cases:
file_name = test_case["file_name"]
......@@ -25,26 +42,18 @@ class CleanerTester:
except:
raise Exception(f"could not download {file_name}")
# test cleaner
reader = PDFReader()
for test_case in test_cases:
# read test file
path = "database-server/input/" + test_case["file_name"]
doc = reader.read_file(path, {"computer_vision": False})
@staticmethod
def check_log(test_case, log):
# get relevant info from dict
recurrent_strings_dict = test_case["recurrent_strings"]
first_page_nr = test_case["first_page_number"]
pages_without_number = test_case["pages_without_numbers"]
page_numbers = test_case["page_numbers"]
number_pages = test_case["number_pages"]
redundant_slides = test_case["redundant_slides"]
prefix, postfix = test_case["prefix"], test_case["postfix"]
# clean file and log cleaner output
with io.StringIO() as buf, redirect_stdout(buf):
cleaner.clean_document(doc, cleaner_args)
log = buf.getvalue()
# extract info from log
log_info = []
for line in log.split("\n"):
......@@ -59,12 +68,27 @@ class CleanerTester:
# where two recurrent strings where treated as one
for rs in recurrent_strings_dict.keys():
if rs in removed_string and rs != removed_string:
log_info.append(("substring", log_page, rs))
log_info.append(("substring split", log_page, rs))
removed_string = removed_string.replace(rs, "").strip()
log_info.append((log_type, log_page, removed_string))
# for entry in log_info:
# print(entry)
# check if wrong strings were removed
removed_pages = [] # has te be calculated first, for offset
for log_type, log_page, removed_string in log_info:
if log_type == "redundant page":
removed_pages.append(log_page)
if log_page not in redundant_slides:
print(f"page {log_page} was wrongly removed")
else:
print(f"check: removed page {log_page}")
offsets = [
sum(1 for pg_nr in removed_pages if pg_nr < i) for i in range(number_pages)
]
offsets = [o for i, o in enumerate(offsets) if i not in removed_pages]
removed_page_number_on = []
removed_recurrent_string_on = {
string: [] for string in recurrent_strings_dict.keys()
......@@ -91,7 +115,12 @@ class CleanerTester:
else:
out = f"removed '{removed_string}' instead of '{target_string}'"
print(out)
elif log_type == "substring":
elif log_type == "substring" or log_type == "substring split":
if removed_string.strip() == "":
continue
if log_type == "substring":
# offset = sum(1 for pg_nr in removed_pages if pg_nr < log_page)
log_page += offsets[log_page]
page_nr_list = recurrent_strings_dict.get(removed_string)
if page_nr_list is None or log_page not in page_nr_list:
print(f"'{removed_string}' was wrongly removed on {log_page}")
......@@ -110,7 +139,8 @@ class CleanerTester:
# check if recurrent sub string that should be removed was removed
for substring, pages in recurrent_strings_dict.items():
not_removed_pages = list(
set(pages) - set(removed_recurrent_string_on[substring])
(set(pages) - set(removed_recurrent_string_on[substring]))
- set(removed_pages)
)
for page_nr in not_removed_pages:
out = (
......@@ -118,12 +148,19 @@ class CleanerTester:
+ f"page {first_page_nr + page_nr} but wasn't"
)
print(out)
# check if every page, that should be removed, was removed
for i in range(number_pages):
if i not in removed_pages and i in redundant_slides:
print(f"page {i} should have been removed but wasn't")
print()
# Manual entry point: run the selected cleaner against all test cases
# with verbose logging enabled.
if __name__ == "__main__":
    use_basic_cleaner = False
    cleaner = BasicCleaner() if use_basic_cleaner else TextBlockCleaner()
    tester = CleanerTester()
    tester.test_cleaner(cleaner, {"verbose": True})
......@@ -8,13 +8,14 @@ test_cases = [
"page_numbers": [None] + list(range(2, 15)),
"prefix": "13:",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"M. Arenas and G. I. Diaz": list(range(1, 14, 2)),
"ACM Transactions on Database Systems, Vol. 41, No. 2, Article 13, Publication date: May 2016.": list(
range(14)
),
"The Exact Complexity of the First-Order Logic Definability Problem": list(
range(0, 14, 2)
"The Exact Complexity of the First-Order Logic Denability Problem": list(
range(2, 14, 2)
),
},
},
......@@ -26,11 +27,11 @@ test_cases = [
"page_numbers": [None] + list(range(2, 18)),
"prefix": "XX:",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"The Complexity of Reverse Engineering Problems for Conjunctive Queries": [
0
]
+ list(range(1, 17, 2)),
"The Complexity of Reverse Engineering Problems for Conjunctive Queries": list(
range(1, 17, 2)
),
"P. Barceló and M. Romero": list(range(2, 17, 2)),
},
},
......@@ -42,6 +43,7 @@ test_cases = [
"page_numbers": list(range(109, 121)),
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -52,6 +54,7 @@ test_cases = [
"page_numbers": [None for _ in range(10)],
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -62,6 +65,7 @@ test_cases = [
"page_numbers": list(range(1134, 1143)),
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"Communications of the ACM": list(range(9)),
"November 1984 Volume 27 Number 11": list(range(9)),
......@@ -76,6 +80,7 @@ test_cases = [
"page_numbers": list(range(1, 26)),
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -86,6 +91,7 @@ test_cases = [
"page_numbers": [None] + list(range(2, 8)) + [None],
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -96,6 +102,7 @@ test_cases = [
"page_numbers": [None for _ in range(11)],
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -106,6 +113,7 @@ test_cases = [
"page_numbers": list(range(1, 50)),
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -116,6 +124,7 @@ test_cases = [
"page_numbers": list(range(1, 5)),
"prefix": "Page ",
"postfix": " of 4",
"redundant_slides": [],
"recurrent_strings": {"PDF Bookmark Sample": list(range(4))},
},
{
......@@ -126,6 +135,7 @@ test_cases = [
"page_numbers": [None, 2, 3, None],
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
{
......@@ -136,6 +146,7 @@ test_cases = [
"page_numbers": list(range(13, 30)),
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"R. STANLEY, HYPERPLANE ARRANGEMENTS": list(range(1, 17, 2)),
"LECTURE 2. PROPERTIES OF THE INTERSECTION POSET": list(range(2, 17, 2)),
......@@ -149,8 +160,128 @@ test_cases = [
"page_numbers": [1, 2, 3, 4, None],
"prefix": "1-",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {},
},
# slides
{
"file_name": "1926c83ecd7ea700f7cb63914c6d7c0f_MIT18_S096F13_lecnote8.pdf",
"first_page_number": 1,
"number_pages": 33,
"pages_without_numbers": [32],
"page_numbers": list(range(1, 33)) + [None],
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"Time Series Analysis": list(range(32)),
"MIT 18.S096": list(range(32)),
"Stationarity and Wold Representation Theorem Autoregressive and Moving Average (ARMA) Models Accommodating Non-Stationarity: ARIMA Models Estimation of Stationary ARMA Models Tests for Stationarity/Non-Stationarity": list(
range(1, 32)
),
},
},
{
"file_name": "MAEVS1.pdf",
"first_page_number": 1,
"number_pages": 44,
"pages_without_numbers": [],
"page_numbers": list(range(1, 45)),
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"\\uf0d3 Peter Buchholz 2020": list(range(44)),
"Modellierung eingebetteter und verteilter Systeme": list(range(44)),
"Kap. 1 Einleitung und Übersicht": list(range(44)),
},
},
{
"file_name": "mit18_05_s22_lec01.pdf",
"first_page_number": 1,
"number_pages": 34,
"pages_without_numbers": [],
"page_numbers": list(range(1, 21))
+ [21, 21]
+ list(range(22, 26))
+ [26, 26]
+ list(range(27, 33)),
"prefix": "",
"postfix": "/32",
"redundant_slides": [20, 26],
"recurrent_strings": {},
},
{
"file_name": "Praesentation.pdf",
"first_page_number": 1,
"number_pages": 102,
"pages_without_numbers": [],
"page_numbers": [1]
+ [2 for _ in range(6)]
+ list(range(3, 9))
+ [9 for _ in range(9)]
+ [10, 11, 12, 13, 13, 14, 14]
+ list(range(15, 20))
+ [20, 20, 20, 21, 22, 23, 23, 24, 25, 26, 26]
+ list(range(27, 41))
+ [41 for _ in range(6)]
+ [42, 42]
+ list(range(43, 47))
+ [47, 47, 47, 48, 49, 49, 49, 49, 49, 50, 50, 51, 52, 53, 53]
+ list(range(54, 70)),
"prefix": "",
"postfix": "/69",
"redundant_slides": list(range(1, 6))
+ list(range(13, 21))
+ [25, 27, 34, 35, 39, 43]
+ list(range(59, 64))
+ [65, 71, 72]
+ list(range(75, 79))
+ [80, 84],
"recurrent_strings": {
"Robin Ebbinghaus | Proseminar „Convolutional Neural Networks - Methoden und Anwendungen“": list(
range(102)
),
"Aufbau eines Feed-Forward Neural Networks Matrixrepräsentation eines Feed Forward Neural Networks Training eines 1-Neuron Netz": list(
range(102)
),
},
},
{
"file_name": "Proseminar_DeepLearning-Folienvorlage_LaTeX-Beamer.pdf",
"first_page_number": 1,
"number_pages": 21,
"pages_without_numbers": [],
"page_numbers": list(range(1, 9)) + [9 for _ in range(10)] + [10, 11, 12],
"prefix": "",
"postfix": "/12",
"redundant_slides": list(range(8, 17)),
"recurrent_strings": {
"Name / Autor | Proseminar „Convolutional Neural Networks - Methoden und Anwendungen“": list(
range(21)
),
"Introduction Basic structuring Fancy features": list(range(21)),
},
},
{
"file_name": "02 Business-IT Alignment.pdf",
"first_page_number": 1,
"number_pages": 52,
"pages_without_numbers": [0, 3, 29, 51],
"page_numbers": [None, 2, 3, None]
+ list(range(5, 30))
+ [None]
+ list(range(31, 52))
+ [None],
"prefix": "",
"postfix": "",
"redundant_slides": [],
"recurrent_strings": {
"Christian Janiesch | IT Management": list(range(52)),
"Chair of Enterprise Computing": list(range(52)),
"Department of Computer Science": list(range(52)),
},
},
]
# link to the storage, where the documents are to be found
......
This diff is collapsed.
......@@ -108,6 +108,9 @@ class PDFReader(AbstractFilereader):
)
# Create segments without cv
# if filereader_args["clean_text_boxes"]:
# segments = TextBlockCleaner.get_cleaned_segments(pdf_path=pdf_path)
# else:
segments = PDFReader.__get_pages_as_segments(pdf_path=pdf_path, apply=apply)
# If cv is to be used, create the appropriate settings and augment
......@@ -230,6 +233,8 @@ class PDFReader(AbstractFilereader):
filereader_args["model_prefix_path"] = None
if "normalize" not in filereader_args:
filereader_args["normalize"] = True
if "clean_text_boxes" not in filereader_args:
filereader_args["clean_text_boxes"] = True
@staticmethod
def __normalize_string(text: str) -> str:
......@@ -417,6 +422,9 @@ class PDFReader(AbstractFilereader):
with fitz.open(filename=pdf_path) as doc:
meta = doc.metadata
# save path in meta data, so it can be used by the TextBlockCleaner
meta["path"] = pdf_path
# Move "title" to "original_title" and determine the new "title" from
# pdf_path.
meta["original_title"] = meta["title"]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment