-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsummarize_book.py
More file actions
81 lines (63 loc) · 2.64 KB
/
summarize_book.py
File metadata and controls
81 lines (63 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import argparse
import os
from pathlib import Path
from dotenv import load_dotenv
from epub_extractor import extract
from epub_processor import process_epub
from send_prompts import PromptProcessor
def summarize_book(epub_path: str, out_dir: str | None) -> None:
    """Extract an EPUB, summarize every chapter, then produce a global book summary.

    Pipeline: extract the EPUB, convert it to plain text chapters, summarize
    each chapter via the OpenAI API, and finally combine all chapter
    summaries into one ``BOOK_summary.md``.

    Args:
        epub_path: Path to the ``.epub`` file to summarize.
        out_dir: Optional directory for the generated summaries; when ``None``,
            a ``responses`` folder inside the processed book folder is used.

    Raises:
        FileNotFoundError: If the processed book folder is missing after extraction.
        RuntimeError: If ``OPENAI_API_KEY`` is not configured.
    """
    # Defaults tuned for long chapters.
    MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
    OUTPUT_TOKENS = int(os.getenv("OPENAI_OUTPUT_MAX_TOKENS", "800"))
    # Larger inter-request delay for stability (rate limits).
    REQ_DELAY = float(os.getenv("OPENAI_REQUEST_DELAY_SEC", "0.5"))

    # 1) Extract the EPUB contents.
    extract(epub_path)

    # 2) Process the extracted files to text.
    epub = Path(epub_path)
    books_folder = Path('books')
    # Sanitize the stem: keep only letters, digits and spaces, drop trailing
    # spaces. NOTE(review): assumes this mirrors how epub_extractor names the
    # output folder — confirm against extract().
    safe_stem = "".join([c for c in epub.stem if c.isalpha() or c.isdigit() or c == ' ']).rstrip()
    book_folder = books_folder / safe_stem
    if not book_folder.exists():
        raise FileNotFoundError(f"Processed book folder not found: {book_folder}")
    process_epub(book_folder, character_limit=999999999)

    # 3) Summarize chapters only, into a flat responses folder.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not set. Add it to .env or env vars.")
    responses_dir = Path(out_dir) if out_dir else (book_folder / "responses")
    responses_dir.mkdir(parents=True, exist_ok=True)
    processor = PromptProcessor(
        book_path=str(book_folder),
        api_key=api_key,
        model=MODEL,
        output_max_tokens=OUTPUT_TOKENS,
        request_delay_sec=REQ_DELAY,
        responses_dir_override=str(responses_dir),
    )
    processor.process_files()

    # 4) Global book summary assembled from the chapter summaries.
    pieces = []
    for md in sorted(responses_dir.glob('ch*_summary.md')):
        try:
            text = md.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError):
            # Best-effort: skip unreadable/mis-encoded chapter files rather
            # than failing the whole run (narrowed from a bare Exception
            # catch that also hid programming errors).
            continue
        pieces.append(f"\n\n## Source: {md.name}\n\n{text}\n")
    if pieces:
        big_prompt = "\n\n".join(pieces)
        # Reuse the processor's client; its token cap is already configured.
        content = processor.send_prompt(big_prompt)
        (responses_dir / 'BOOK_summary.md').write_text(content, encoding='utf-8')
        print(f"Global summary generated at: {responses_dir / 'BOOK_summary.md'}")
def main():
    """CLI entry point: load environment config, parse arguments, run the pipeline."""
    load_dotenv()
    arg_parser = argparse.ArgumentParser(
        description="Summarize an EPUB into Spanish chapter summaries + global summary.",
    )
    arg_parser.add_argument("epub_path", help="Path to .epub file")
    arg_parser.add_argument(
        "out_dir",
        nargs='?',
        default=None,
        help="Optional output directory for summaries",
    )
    cli = arg_parser.parse_args()
    summarize_book(epub_path=cli.epub_path, out_dir=cli.out_dir)


if __name__ == "__main__":
    main()