From b7245e1ef9b562f22dbf2e3d6ce6cf95bf61f6a2 Mon Sep 17 00:00:00 2001 From: lifegpc Date: Sat, 25 Jun 2022 18:43:51 +0800 Subject: [PATCH] add split_novel --- split_novel.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 split_novel.py diff --git a/split_novel.py b/split_novel.py new file mode 100644 index 0000000..8917c6b --- /dev/null +++ b/split_novel.py @@ -0,0 +1,41 @@ +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from re import compile +from os.path import join, splitext +from os import makedirs + + +arg = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) +arg.add_argument('PATH', help='The path to input file.') +arg.add_argument('-o', '--output', help="The path to output directory.", + metavar='OUTPUT', dest='output') +arg.add_argument('-e', '--encoding', help='File encoding.', + default='UTF-8', metavar='ENCODING', dest='encoding') +arg.add_argument('-r', '--regex', help='Chapter detect regex.', + default=r'^第 ?\d+ ?章', metavar='REGEX', dest='regex') +args = arg.parse_intermixed_args() +print(args) +regex = compile(args.regex) +if args.output is None: + args.output = splitext(args.PATH)[0] +makedirs(args.output, exist_ok=True) +cur = [] +ind = 1 +with open(args.PATH, 'r', encoding=args.encoding) as f: + while True: + now = f.readline() + if not now.endswith('\n'): + break + now = now.rstrip('\n') + if regex.search(now) is not None: + print(now, len(cur)) + if len(cur): + with open(join(args.output, f'{ind:06}.txt'), 'w', encoding=args.encoding) as o: + o.write('\n'.join(cur)) + cur = [] + ind += 1 + cur.append(now) +if len(cur): + with open(join(args.output, f'{ind:06}.txt'), 'w', encoding=args.encoding) as o: + o.write('\n'.join(cur)) + cur = [] + ind += 1