Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
KoDuP-Germanistik
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Noah Bubenhofer
KoDuP-Germanistik
Commits
3f56020d
Commit
3f56020d
authored
1 year ago
by
Sonja Huber
Browse files
Options
Downloads
Patches
Plain Diff
adapt extract_comments.py to update on xml_extractor.py
parent
07da08dc
Branches
update_extr_comments
Branches containing commit
No related tags found
1 merge request
!15
adapt extract_comments.py to update on xml_extractor.py
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
3._Tokenization_und_PoS_Tagging/extract_comments.py
+43
-79
43 additions, 79 deletions
3._Tokenization_und_PoS_Tagging/extract_comments.py
with
43 additions
and
79 deletions
3._Tokenization_und_PoS_Tagging/extract_comments.py
+
43
−
79
View file @
3f56020d
...
...
@@ -3,7 +3,7 @@ Skript um aus einem Trafilatura-XML-File die Kommentare passend für Promethia z
Input:
1. Name der Inputdatei im XML-Format, von Trafilatura gescrapt
2. Name der Outputdatei im XML-Format
2. Name der Outputdatei im XML-Format
, kann von Promethia verarbeitet werden
Output:
XML-Datei mit den Kommentaren in passendem Format für Promethia.
...
...
@@ -11,91 +11,55 @@ Falls schon etwas in der Datei steht, wird der Inhalt per default überschrieben
"""
import
argparse
import
re
from
xml_converter
import
check_for_tags
def write_comments(out_file, id_count, title, comments):
    """Write one ``<text>`` entry with header and comment body to the output.

    Args:
        out_file: Already opened, writable text file object.
        id_count: Running identifier written into the ``<id>`` element.
        title: Result of the title regex search (a match object), or ``None``
            when no title was found; in that case an empty ``<title>`` is
            written instead of raising ``AttributeError``.
        comments: Iterable of comment lines; each is passed through
            ``check_for_tags`` (imported from ``xml_converter``) before it is
            written.
    """
    # re.search() returns None when the page carries no usable title attribute.
    title_text = title.group() if title is not None else ""

    out_file.write("\t<text>\n")
    out_file.write("\t\t<header>\n")
    out_file.write(f"\t\t\t<id>{id_count}</id>\n")
    out_file.write(f"\t\t\t<title>{title_text}</title>\n")
    out_file.write("\t\t</header>\n")
    out_file.write("\t\t<body>\n")
    for line in comments:
        out_file.write("\t\t\t" + check_for_tags(line) + "\n")
    out_file.write("\t\t</body>\n")
    out_file.write("\t</text>\n")
#from xml_converter import check_for_tags
import
lxml.etree
as
ET
def extract_comments(in_file: str, out_file: str) -> None:
    """Extract the comments of every ``<doc>`` into a ``<corpus>`` XML file.

    The input is parsed with lxml. For each ``<doc>`` element that has a
    ``<comments>`` child with non-whitespace text, a ``<text>`` entry is
    appended to the corpus, consisting of a ``<header>`` (``<id>`` and
    ``<title>``) and a ``<body>`` holding the plain text of the comments.

    Args:
        in_file: Path of the input XML file.
        out_file: Path of the output XML file; it is written with an XML
            declaration, UTF-8 encoding and pretty printing.
    """
    trafi_root = ET.parse(in_file).getroot()

    promethia_tree = ET.ElementTree(ET.Element("corpus"))
    corpus_root = promethia_tree.getroot()

    for index, doc_element in enumerate(trafi_root.iter("doc")):
        comments_element = doc_element.find("comments")
        if comments_element is None:
            # Document has no <comments> child at all; nothing to extract.
            continue

        # Serialise once; the text output also contains the element's tail,
        # so an empty <comments/> yields only whitespace.
        comment_text = ET.tostring(
            comments_element, method="text", encoding="unicode"
        )
        if not comment_text.strip():
            # Compare by content: an identity test against a string literal
            # never filters out the empty case.
            continue

        text = ET.SubElement(corpus_root, "text")
        header = ET.SubElement(text, "header")

        # NOTE(review): the id is the position among all <doc> elements, so it
        # starts at 0 and has gaps for skipped documents - confirm this is
        # what the consumer of the corpus expects.
        id_element = ET.SubElement(header, "id")
        id_element.text = str(index)

        title_element = ET.SubElement(header, "title")
        title_element.text = doc_element.get("title")

        body = ET.SubElement(text, "body")
        body.text = comment_text

    promethia_tree.write(
        out_file, pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
    print(f"XML data saved to {out_file}")
    print("Done")
if __name__ == "__main__":
    # Command-line entry point: two positional arguments, input and output path.
    parser = argparse.ArgumentParser(description="Kommentar-Extraktor")
    parser.add_argument(
        "in_file",
        type=str,
        help="Input-Datei (als Trafilatura-XML). Enthält alles.",
    )
    # Register out_file exactly once; a second registration of the same
    # positional would make the parser demand an extra argument.
    parser.add_argument(
        "out_file",
        type=str,
        help="Output-Datei (als Promethia-XML). Enthält nur Kommentare.",
    )
    args = parser.parse_args()

    extract_comments(args.in_file, args.out_file)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment