PyXML minidom parsing problems
Problēma ir sekojoša. Ir XML fails:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE xliff PUBLIC "-//XLIFF//DTD XLIFF//EN" "http://www.oasis-open.org/committees/xliff/documents/xliff.dtd">
<xliff xmlns="urn:oasis:names:tc:xliff:document:1.2" xmlns:logoport="urn:logoport:xliffeditor:xliff-extras:1.0" xmlns:idiom_temp="urn:logoport:xliffeditor:xliff-idiom-temp:1.0" xmlns:iws="http://www.idiominc.com/ws/asset">
<file datatype="plaintext" source-language="EN" original="MultiLingualContent.fla" target-language="lv-lv">
<header/>
<body>
<trans-unit resname="IDS_1" id="001">
<source>
sourcesource<ph x="START_TAG" id="1" logoport:equiv-text="START_TAG">{1}</ph>sourcesource
</source>
<target logoport:matchpercent="0" state="translated">
targettarget<ph x="START_TAG" id="1" logoport:equiv-text="START_TAG">{1}</ph>targettarget
</target>
</trans-unit>
<trans-unit resname="IDS_2" id="002">
<source>
sourcesource<ph x="START_TAG" id="2" logoport:equiv-text="START_TAG">{2}</ph>sourcesource
</source>
<target logoport:matchpercent="0" state="translated">
targettarget<ph x="START_TAG" id="2" logoport:equiv-text="START_TAG">{2}</ph>targettarget
</target>
</trans-unit>
<trans-unit resname="IDS_3" id="003">
<source>
sourcesource<ph x="START_TAG" id="3" logoport:equiv-text="START_TAG">{3}</ph>sourcesource
</source>
<target logoport:matchpercent="0" state="translated">
targettarget<ph x="START_TAG" id="3" logoport:equiv-text="START_TAG">{3}</ph>targettarget
</target>
</trans-unit>
<trans-unit resname="IDS_4" id="004">
<source>
sourcesource<ph x="START_TAG" id="4" logoport:equiv-text="START_TAG">{4}</ph>sourcesource
</source>
<target logoport:matchpercent="0" state="translated">
targettarget<ph x="START_TAG" id="4" logoport:equiv-text="START_TAG">{4}</ph>targettarget
</target>
</trans-unit>
</body>
</file>
</xliff>
Ir Python skripts, ar kura palīdzību katras
<target>...</target> virknes beigās pievienot newline, goes like this:
#!/usr/bin/env python
import sys
import getopt
from xml.dom import minidom
def usage():
print >> sys.stderr, 'Usage: parse.py --file file_name'
def ferror():
print >> sys.stderr, 'Cannot open file'
sys.exit(2)
def main(argv):
if len(argv) != 1:
usage()
sys.exit(2)
try:
opts, args = getopt.getopt(argv, "hf:", ["help", "file="])
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-f", "--file"):
fileName = arg
else:
assert False, "unhandled option"
try:
dom = minidom.parse(fileName)
file = open(fileName, 'w')
except:
ferror()
rc = ''
for node in dom.getElementsByTagName('target'):
if node.firstChild.nodeType == minidom.Node.TEXT_NODE:
rc = node.firstChild.data + '\n'
node.firstChild.data = rc
file.write(dom.toxml('utf-8'))
file.close()
if __name__ == "__main__":
main(sys.argv[1:])
Problēma sastāv no apstākļa, ka tāda vai citāda iemesla dēļ XML parseris apstājas pie
<ph>...</ph> taga un ievieto newline tur, ignorējot atlikušo virknes daļu.