tanos-japanese-word-books/python/htmlToTex.py

36 lines
1.2 KiB
Python
Raw Permalink Normal View History

2022-02-08 01:39:19 +01:00
#!/usr/bin/env python3
from lxml import etree;
from re import sub
def extractCellText(col):
    """Return the LaTeX-ready text of the first ``<a>`` child of a table cell.

    ``col`` is any object whose ``xpath('a')`` returns a sequence of elements
    exposing a ``.text`` attribute (here: an lxml ``<td>`` element).

    Returns an empty string when the cell holds no ``<a>`` element or when
    that element has no text. Otherwise ``#`` is escaped for LaTeX and every
    ``<number>^<number>`` expression is wrapped in ``$...$`` (math mode).
    """
    anchors = col.xpath('a')
    # An <a> without leading text has ``.text is None``; treat it like an
    # empty cell instead of failing on ``None.replace``.
    if not anchors or anchors[0].text is None:
        return ''
    escaped = anchors[0].text.replace('#', '\\#')
    return sub(r'(-?\d+(?:\.\d+)?\^-?\d+(?:\.\d+)?)', r'$\1$', escaped)


# Make cell with multiple rows in latex if there's multiple meanings/readings
def makeMultiCellIfMultipleEntries(cellText, rowtype='j'):
    """Wrap ``cellText`` in a left-aligned makecell when it has several entries.

    ``rowtype`` selects the entry separator: ``'j'`` splits on ``/`` and
    ``'e'`` splits on ``,`` (presumably Japanese vs. English columns; the
    script uses ``'e'`` for the third column only). Text without the
    separator, or with any other ``rowtype``, is returned unchanged.
    """
    separator = {'j': '/', 'e': ','}.get(rowtype)
    if separator is None or separator not in cellText:
        return cellText
    # Each separator becomes a LaTeX line break inside the makecell.
    stacked = cellText.replace(separator, ' \\\\ ')
    return '\\makecell[l]{ ' + stacked + ' }'


def _formatRow(row):
    """Join the first three cells of ``row`` into one LaTeX table row."""
    first = makeMultiCellIfMultipleEntries(row[0])
    second = makeMultiCellIfMultipleEntries(row[1])
    third = makeMultiCellIfMultipleEntries(row[2], rowtype="e")
    return f'{first} & {second} & {third}'


def _formatTable(rows):
    """Render ``rows`` as a complete ``longtabu`` environment."""
    body = " \\\\\\hline\n".join(_formatRow(row) for row in rows)
    return '\\begin{longtabu} to \\textwidth {ll|l}\n' + body + '\n\\end{longtabu}'


def convertLevel(n):
    """Convert ``./data/html/n{n}.html`` into ``build/texdata/n{n}.tex``.

    Every ``//tbody/tr`` row except the first (the header) is turned into a
    table row built from its first three ``td`` cells.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(f'./data/html/n{n}.html', 'r', encoding='utf-8') as file:
        doc = etree.parse(file)

    # Skip header: slicing also copes with a table that has no rows at all.
    rows = [
        tuple(map(extractCellText, tr.xpath("td")))
        for tr in doc.xpath("//tbody/tr")[1:]
    ]

    with open(f'build/texdata/n{n}.tex', 'w', encoding='utf-8') as file:
        file.write(_formatTable(rows))


def main():
    """Convert the files n5 down to n1, in that order."""
    for n in reversed(range(1, 6)):
        convertLevel(n)


if __name__ == '__main__':
    main()