diff --git a/split_novel.py b/split_novel.py new file mode 100644 index 0000000..8917c6b --- /dev/null +++ b/split_novel.py @@ -0,0 +1,41 @@ +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from re import compile +from os.path import join, splitext +from os import makedirs + + +arg = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) +arg.add_argument('PATH', help='The path to input file.') +arg.add_argument('-o', '--output', help="The path to output directory.", + metavar='OUTPUT', dest='output') +arg.add_argument('-e', '--encoding', help='File encoding.', + default='UTF-8', metavar='ENCODING', dest='encoding') +arg.add_argument('-r', '--regex', help='Chapter detect regex.', + default=r'^第 ?\d+ ?章', metavar='REGEX', dest='regex') +args = arg.parse_intermixed_args() +print(args) +regex = compile(args.regex) +if args.output is None: + args.output = splitext(args.PATH)[0] +makedirs(args.output, exist_ok=True) +cur = [] +ind = 1 +with open(args.PATH, 'r', encoding=args.encoding) as f: + while True: + now = f.readline() + if not now.endswith('\n'): + break + now = now.rstrip('\n') + if regex.search(now) is not None: + print(now, len(cur)) + if len(cur): + with open(join(args.output, f'{ind:06}.txt'), 'w', encoding=args.encoding) as o: + o.write('\n'.join(cur)) + cur = [] + ind += 1 + cur.append(now) +if len(cur): + with open(join(args.output, f'{ind:06}.txt'), 'w', encoding=args.encoding) as o: + o.write('\n'.join(cur)) + cur = [] + ind += 1