diff --git a/2._Web-Crawling/scrape_websites.py b/2._Web-Crawling/scrape_websites.py
index b3230d13315284b2c6b93c6c9c41bbd5d4e952b3..a75bf8603c87f121ca6c1430023c75956577b5f9 100644
--- a/2._Web-Crawling/scrape_websites.py
+++ b/2._Web-Crawling/scrape_websites.py
@@ -46,7 +46,7 @@ def scrape_websites(file_url_list, file_output):
 
         # the downloaded content is made usable by trafilatura
         # the output format is XML, comments are included, and only German-language pages are scraped
-        website_content_cleaned = extract(website_content, output_format='xml', include_comments=True,
+        website_content_cleaned = extract(website_content, with_metadata=True, output_format='xml', include_comments=True,
                                           target_language='de')
        if website_content_cleaned is None:  # in case the extraction cannot be carried out
 
diff --git a/3._Tokenization_und_PoS_Tagging/xml_converter.py b/3._Tokenization_und_PoS_Tagging/xml_converter.py
index ced768711eecb21286444728a28dbb7662053ff5..5a1e5c6b897ad6284194c10cba0a5c9280a655a0 100644
--- a/3._Tokenization_und_PoS_Tagging/xml_converter.py
+++ b/3._Tokenization_und_PoS_Tagging/xml_converter.py
@@ -27,35 +27,35 @@ def xml_converter(in_file, out_file):
     corpus_root = promethia_root.getroot()
 
     for index, doc_element in enumerate(trafi_root.iter("doc")):
-
-        # Create a 'text' element and add it to the 'corpus'
+        # Create 'text' element
         text = ET.SubElement(corpus_root, "text")
-        # Create a 'header' element and add it to 'text'
+
+        # Create 'header' element
         header = ET.SubElement(text, "header")
-
-        # Create 'id' and 'title' elements and add them to 'header'
-        id_element = ET.SubElement(header, "id")
-        id_element.text = str(index)
-        # Create 'date' element and add it to 'header'
-        date_element = ET.SubElement(header, "date")
-        if doc_element.get("date"):
-            date_element.text = doc_element.get("date")
+        # Get all attributes present in the current document
+        doc_attribs = doc_element.attrib.keys()
 
-        title_element = ET.SubElement(header, "title")
-        if doc_element.get("title"):
-            title_element.text = doc_element.get("title")
+        # Loop through all attributes dynamically
+        for field in doc_attribs:
+            element = ET.SubElement(header, field)
+            element.text = doc_element.get(field)
 
-        doc_content = doc_element.find("main")
-        # Create a 'body' element and add it to 'text'
-        body = ET.SubElement(text, "body")
-        body.text = ET.tostring(doc_content, method="text", encoding='Unicode')  # Replace with your text content
+        # Manually add 'id' since it's not in the document attributes
+        id_element = ET.SubElement(header, "id")
+        id_element.text = str(index)
 
-    # Save the XML to a file
-    output_file_path = out_file
-    promethia_root.write(
-        output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
-    )
+        # Create 'body' element
+        body = ET.SubElement(text, "body")
+        doc_content = doc_element.find("main")
+        if doc_content is not None:
+            body.text = ET.tostring(doc_content, method="text", encoding='Unicode')
+
+    # Save the XML to a file
+    output_file_path = out_file
+    promethia_root.write(
+        output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
+    )
     print(f"XML data saved to {output_file_path}")
     print("Done")
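
Note on the scrape_websites.py change: passing with_metadata=True makes trafilatura attach the metadata it detects (title, date, author, url, ...) as attributes on the root <doc> element of the XML output, which is exactly what xml_converter.py later reads via doc_element.attrib. A minimal sketch of the behavior, assuming trafilatura is installed (the HTML string is invented; extract() can still return None for very short or non-German pages, which the existing None-check covers):

    from trafilatura import extract

    # made-up sample page, stands in for the downloaded website_content
    html = (
        "<html><head><title>Beispielseite</title></head>"
        "<body><main><p>Hallo Welt, dies ist ein etwas laengerer Beispieltext.</p></main></body></html>"
    )
    # with_metadata=True writes the detected metadata as attributes on <doc ...>
    xml = extract(html, with_metadata=True, output_format='xml', include_comments=True)
    print(xml)  # may be None if extraction fails, hence the guard in the script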
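
Note on the xml_converter.py change: instead of hard-coding date and title, the header now mirrors whatever attributes trafilatura put on each <doc>, and the body is only filled when a <main> element exists. A self-contained sketch of the new header logic, using lxml.etree as ET (the pretty_print keyword in the write() call suggests lxml; the sample attributes are invented):

    from lxml import etree as ET

    # stand-in for one <doc> element from trafilatura's XML output
    doc_element = ET.fromstring(
        '<doc title="Beispielseite" date="2024-01-01" url="https://example.org">'
        '<main><p>Hallo Welt</p></main></doc>'
    )

    header = ET.Element("header")
    # copy every attribute trafilatura attached (title, date, author, url, ...)
    for field in doc_element.attrib.keys():
        element = ET.SubElement(header, field)
        element.text = doc_element.get(field)

    # 'id' is not a <doc> attribute, so it is added manually (here: index 0)
    id_element = ET.SubElement(header, "id")
    id_element.text = str(0)

    print(ET.tostring(header, pretty_print=True, encoding="unicode"))
    # -> <header><title>Beispielseite</title><date>2024-01-01</date>
    #    <url>https://example.org</url><id>0</id></header>

This way xml_converter() no longer drops metadata fields the old hard-coded version never looked at (e.g. author), and the new `if doc_content is not None` guard avoids the TypeError that ET.tostring() raises when a scraped page has no <main> element.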