new parser
This commit is contained in:
168
parse.py
168
parse.py
@@ -1,9 +1,10 @@
|
||||
import enum
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import re
|
||||
import json
|
||||
|
||||
entry_re = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', re.MULTILINE)
|
||||
entry_re = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ?', re.MULTILINE)
|
||||
|
||||
curr_day = ''
|
||||
|
||||
@@ -143,25 +144,137 @@ entry_modules = {
|
||||
'notify': create_entry_module_parser('notify', parse_notify),
|
||||
}
|
||||
|
||||
from functools import reduce, partial
|
||||
|
||||
def split_keep(delims, string):
    """Split *string* on any single character found in *delims*, keeping each
    delimiter as its own element of the result (empty pieces are dropped)."""
    pieces = []
    pending = []
    for ch in string:
        if ch not in delims:
            pending.append(ch)
            continue
        # flush the accumulated run, then emit the delimiter itself
        if pending:
            pieces.append(''.join(pending))
            pending = []
        pieces.append(ch)
    if pending:
        pieces.append(''.join(pending))
    return pieces


assert split_keep(['@', '\n'], 'hello @world\n\nabout') == ['hello ', '@', 'world', '\n', '\n', 'about']
|
||||
|
||||
def merge_chars(chars, l):
    """Collapse adjacent occurrences of any character in *chars* into a single
    run-string (e.g. '\\n', '\\n' -> '\\n\\n'); other items pass through."""
    out = []
    for item in l:
        # extend the previous element only when it is already a pure run of
        # this same character
        extend = item in chars and bool(out) and all(run_ch == item for run_ch in out[-1])
        if extend:
            out[-1] += item
        else:
            out.append(item)
    return out


assert merge_chars('\n', ['\n', '\n', 'hello', 'world', '\n', '\n']) == ['\n\n', 'hello', 'world', '\n\n']
|
||||
|
||||
def attach_to_next(c, l):
    """Return a copy of *l* in which every standalone element equal to *c* is
    removed and prefixed onto the element that follows it.

    A trailing *c* with nothing after it is left in place (the original code
    silently stopped there as well).
    """
    res = l.copy()
    try:
        while True:
            i = res.index(c)          # ValueError once no standalone c remains
            res[i + 1] = c + res[i + 1]
            res.pop(i)
    except ValueError:
        pass                          # normal termination: no more matches
    except IndexError:
        pass                          # c was the last element; keep it as-is
    # NOTE: was a bare `except:`, which also swallowed KeyboardInterrupt /
    # SystemExit; narrowed to the two exceptions the loop can actually raise.

    return res


assert attach_to_next('@', ['aoeu', '@', 'oeu']) == ['aoeu', '@oeu']
|
||||
|
||||
def attach_to_prev_if(pred, l):
    """Concatenate each element onto the accumulated previous result whenever
    pred(prev, curr) holds.

    Note: *prev* passed to pred is the original neighbour from *l*, not the
    merged accumulator, and a falsy prev (e.g. '') never merges.
    """
    merged = []
    for idx, curr in enumerate(l):
        before = l[idx - 1] if idx > 0 else None
        if before and pred(before, curr):
            merged[-1] += curr
        else:
            merged.append(curr)
    return merged


assert attach_to_prev_if(lambda p, c: p[-1] != '\n' and c[0] == '@', ['aoeu', '@oeu']) == ['aoeu@oeu']
|
||||
|
||||
def merge_notes_block(l):
    """Fold a '@notes' run — tag, newline, source, newline, title — into one
    newline-joined element; every other element is copied through unchanged.

    NOTE(review): assumes a full 5-element run follows every '@notes'; a
    truncated run would raise IndexError — confirm upstream guarantees this.
    """
    folded = []
    pos = 0
    total = len(l)
    while pos < total:
        if l[pos] != '@notes':
            folded.append(l[pos])
            pos += 1
            continue
        # join tag, source and title, skipping the newline separators
        folded.append('\n'.join([l[pos], l[pos + 2], l[pos + 4]]))
        pos += 5
    return folded
|
||||
|
||||
def merge_wrapped_lines(l, width=80):
    """Undo hard line-wrapping: drop a lone '\\n' between two blocks when the
    previous block plus the following block's first word would have overflowed
    *width* columns — i.e. when the break looks forced by line length.

    The block at index 0 is rendered after a timestamp prefix, so its
    effective length is padded by that prefix's width when the separator is
    at index 1.

    width defaults to 80, matching the original hard-coded limit.
    """
    # width of '2020-02-02 02:02:02 ' — the timestamp prefix on the first line
    TIMESTAMP_PAD = len('2020-02-02 02:02:02 ')

    res = []
    i = 0
    while i < len(l):
        curr = l[i]
        prev_item = l[i - 1] if i > 0 else None
        # `next` shadowed the builtin in the original; renamed
        next_item = l[i + 1] if i + 1 < len(l) else None

        if prev_item and next_item and curr == '\n':
            prev_len = len(prev_item)
            if i == 1:
                prev_len += TIMESTAMP_PAD
            if not next_item[0].isspace():
                first_word = next_item.split()[0]
                if prev_len + len(first_word) >= width:
                    # forced wrap: glue the next block onto the previous one
                    res[-1] += ' ' + next_item
                    i += 2
                    continue

        res.append(curr)
        i += 1

    return res
|
||||
|
||||
def apply(f, x):
    """Apply *f* to *x* — the unary application combinator used with reduce."""
    result = f(x)
    return result
|
||||
|
||||
def flip(f):
    """Return a two-argument function like *f* but with its arguments swapped."""
    def flipped(a1, a2):
        return f(a2, a1)
    return flipped
|
||||
|
||||
def parse_entry(entry):
|
||||
result = {}
|
||||
|
||||
def split_into_blocks(text):
|
||||
result = []
|
||||
|
||||
for block in re.split(r'\n{2,}', text):
|
||||
block = block.strip()
|
||||
if not block:
|
||||
continue
|
||||
|
||||
for i, module in enumerate(block.replace(' @', '\n@').split('\n@')):
|
||||
#module = module.strip().replace('\n', ' ')
|
||||
if i == 0:
|
||||
result.append(module)
|
||||
else:
|
||||
result.append('@'+module)
|
||||
|
||||
return result
|
||||
r = reduce(flip(apply), [
|
||||
# split the text into sections by newline and tag symbol, keeping the separators
|
||||
partial(split_keep, ('\n', '@')),
|
||||
# merge sequential newlines together into a single whitespace block
|
||||
partial(merge_chars, '\n'),
|
||||
# attach escaped tag symbols
|
||||
partial(attach_to_prev_if, lambda p, c: c == '@' and p[-1] == '\\'),
|
||||
# attach tag symbols
|
||||
partial(attach_to_next, '@'),
|
||||
# ???
|
||||
partial(attach_to_prev_if, lambda p, c: p[-1] != '\n' and not (p[0] == '@' and p[-1] == ' ') and c[0] == '@'),
|
||||
# yes
|
||||
merge_notes_block,
|
||||
# strip all non-whitespace blocks
|
||||
partial(map, lambda s: s if s.isspace() else s.rstrip()), list,
|
||||
# yes
|
||||
merge_wrapped_lines,
|
||||
# remove trailing whitespace block
|
||||
lambda b: b if b and not all(c == '\n' for c in b[-1]) else b[:-1],
|
||||
], text)
|
||||
return r
|
||||
|
||||
timestamp, content = entry
|
||||
|
||||
@@ -169,7 +282,7 @@ def parse_entry(entry):
|
||||
result['blocks'] = []
|
||||
|
||||
for b in split_into_blocks(content):
|
||||
if b[0] == '@':
|
||||
if b.startswith('@'):
|
||||
tag = b.split()[0][1:]
|
||||
result['blocks'].append(entry_modules[tag](b))
|
||||
else:
|
||||
@@ -177,20 +290,23 @@ def parse_entry(entry):
|
||||
|
||||
return result
|
||||
|
||||
result = {}
|
||||
|
||||
for fpath in list(sorted((Path.home() / 'workspace' / 'journal').glob('*.md'))):
|
||||
curr_day = fpath.stem
|
||||
|
||||
header, *tmp = entry_re.split(fpath.read_text())
|
||||
def parse_page(text):
|
||||
header, *tmp = entry_re.split(text)
|
||||
entries = list(zip(tmp[::2], tmp[1::2]))
|
||||
|
||||
result[fpath.stem] = {
|
||||
return {
|
||||
'header': parse_header(header),
|
||||
'entries': [parse_entry(e) for e in entries],
|
||||
}
|
||||
|
||||
script_path = Path(__file__).parent
|
||||
if __name__ == '__main__':
|
||||
result = {}
|
||||
|
||||
with open(script_path / 'journal.json', 'w') as fp:
|
||||
for fpath in list(sorted((Path.home() / 'workspace' / 'journal').glob('*.md'))):
|
||||
day = parse_page(fpath.read_text())
|
||||
result[fpath.stem] = day
|
||||
|
||||
script_path = Path(__file__).parent
|
||||
|
||||
with open(script_path / 'journal.json', 'w') as fp:
|
||||
json.dump(result, fp, indent=4, ensure_ascii=False)
|
||||
|
||||
Reference in New Issue
Block a user