diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 0872911..a6f5963 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -3,7 +3,7 @@
 class Settings(BaseSettings):
-    database_url: str = "postgresql+asyncpg://postgres:postgres@localhost:5432/summarizerdb"
+    database_url: str = "sqlite+aiosqlite:///./summarizerdb.db"
     secret_key: str = "your-secret-key-here-change-in-production"
     algorithm: str = "HS256"
     access_token_expire_minutes: int = 30
diff --git a/backend/app/core/security.py b/backend/app/core/security.py
index 0318d5e..d504e8a 100644
--- a/backend/app/core/security.py
+++ b/backend/app/core/security.py
@@ -1,6 +1,6 @@
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Optional
-from jose import JWTError, jwt
+import jwt
 from passlib.context import CryptContext
 from app.core.config import settings
@@ -10,9 +10,9 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
     to_encode = data.copy()
     if expires_delta:
-        expire = datetime.utcnow() + expires_delta
+        expire = datetime.now(timezone.utc) + expires_delta
     else:
-        expire = datetime.utcnow() + timedelta(minutes=15)
+        expire = datetime.now(timezone.utc) + timedelta(minutes=15)
     to_encode.update({"exp": expire})
     encoded_jwt = jwt.encode(to_encode, settings.secret_key, algorithm=settings.algorithm)
     return encoded_jwt
@@ -33,5 +33,5 @@ def verify_token(token: str) -> Optional[str]:
         if username is None:
             return None
         return username
-    except JWTError:
+    except jwt.InvalidTokenError:
         return None
\ No newline at end of file
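
Reviewer note on the python-jose to PyJWT swap above: jose's JWTError maps onto PyJWT's jwt.InvalidTokenError (ExpiredSignatureError is a subclass of it), and PyJWT 2.x serializes a timezone-aware datetime in the "exp" claim automatically and returns the token as a str. A minimal round-trip sketch, using the development defaults from config.py in place of the settings object ("alice" is a placeholder subject):

    from datetime import datetime, timedelta, timezone

    import jwt

    SECRET_KEY = "your-secret-key-here-change-in-production"  # dev default from config.py
    ALGORITHM = "HS256"

    # Encode: the aware datetime in "exp" is converted to a numeric claim.
    token = jwt.encode(
        {"sub": "alice", "exp": datetime.now(timezone.utc) + timedelta(minutes=30)},
        SECRET_KEY,
        algorithm=ALGORITHM,
    )

    # Decode: expiry is validated here; InvalidTokenError is the base class
    # that replaces jose's JWTError.
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        print(payload["sub"])  # -> alice
    except jwt.InvalidTokenError:
        print("token rejected")
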
diff --git a/backend/app/core/summarizer.py b/backend/app/core/summarizer.py
index bd74259..ab8c32f 100644
--- a/backend/app/core/summarizer.py
+++ b/backend/app/core/summarizer.py
@@ -7,8 +7,8 @@
 from collections import Counter, namedtuple
 from operator import attrgetter
-import spacy
-from spacy.lang.en.stop_words import STOP_WORDS
+# import spacy
+# from spacy.lang.en.stop_words import STOP_WORDS
 from string import punctuation
 import nltk
 from newspaper import Article
@@ -48,11 +48,11 @@
 summarizer.stop_words = get_stop_words(LANGUAGE)

 # Load spacy model
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
-    nlp = None
+# try:
+#     nlp = spacy.load("en_core_web_sm")
+# except OSError:
+#     logger.warning("Spacy model 'en_core_web_sm' not found. Please install it with: python -m spacy download en_core_web_sm")
+nlp = None

 SentenceInfo = namedtuple("SentenceInfo", ("sentence", "order", "rates",))
@@ -70,16 +70,19 @@ def download_text(url: str) -> Article:
     return article


-def get_significant_words_list(doc) -> List[str]:
-    """Get a list of important words (PROPN; ADJ; NOUN; VERB) excluding stop words and punctuation"""
+def get_significant_words_list(text: str) -> List[str]:
+    """Get a list of important words excluding stop words and punctuation"""
+    # Simplified version without spaCy
+    import re
     words = []
-    stopwords = list(STOP_WORDS)
-    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
-    for token in doc:
-        if (token.text in stopwords or token.text in punctuation):
-            continue
-        if (token.pos_ in pos_tag):
-            words.append(token.text)
+    # Basic stop words
+    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
+
+    # Simple word extraction
+    words_raw = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+    for word in words_raw:
+        if word not in stopwords and word not in punctuation and len(word) > 2:
+            words.append(word)
     return words
@@ -94,16 +97,18 @@
     return freq_word


-def get_sent_strength(doc, freq_word: Counter) -> Dict:
+def get_sent_strength(sentences: List[str], freq_word: Counter) -> Dict:
     """Get sentence importance scores based on word frequencies"""
     sent_strength = {}
-    for sent in doc.sents:
-        for word in sent:
-            if word.text in freq_word.keys():
-                if sent in sent_strength.keys():
-                    sent_strength[sent] += freq_word[word.text]
-                else:
-                    sent_strength[sent] = freq_word[word.text]
+    import re
+
+    for sent in sentences:
+        words = re.findall(r'\b[a-zA-Z]+\b', sent.lower())
+        score = 0
+        for word in words:
+            if word in freq_word:
+                score += freq_word[word]
+        sent_strength[sent] = score
     return sent_strength
@@ -115,29 +120,13 @@
     infos = sorted(infos, key=attrgetter("rates"), reverse=True)[:n_sents]
     infos = sorted(infos, key=attrgetter("order"))
     logger.info(f"Extracted {len(infos)} sentences ...")
-    return tuple(i.sentence.text for i in infos)
+    return tuple(i.sentence for i in infos)


-def extractive_summary_pipeline(doc: str, n_sents: int = 5) -> str:
-    """Generate extractive summary using spacy pipeline"""
-    if not nlp:
-        return extractive_summary_lsa(doc, n_sents)
-
-    doc = nlp(doc)
-    logger.info(f"Starting to compute summary from {len(list(doc.sents))} sentences ...")
-    words = get_significant_words_list(doc)
-    freq_word = get_frequency_words(words)
-    sent_strength = get_sent_strength(doc, freq_word)
-
-    summaries = get_extractive_summary(sent_strength, n_sents=n_sents)
-    if not summaries:
-        return extractive_summary_lsa(doc.text, n_sents)
-
-    start_sentence = list(doc.sents)[0].text
-    total_summary = ' '.join(summaries)
-    if start_sentence in summaries:
-        return total_summary
-    return start_sentence + ' ' + total_summary
+def extractive_summary_pipeline(text: str, n_sents: int = 5) -> str:
+    """Generate extractive summary using simplified pipeline"""
+    # Always use LSA for now since spaCy is disabled
+    return extractive_summary_lsa(text, n_sents)


 def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
@@ -155,45 +144,28 @@ def extractive_summary_lsa(text: str, n_sents: int = 5) -> str:
 def get_nest_sentences(document: str, tokenizer: AutoTokenizer, token_max_length: int = 1024) -> List[str]:
     """Split document into chunks with maximum token length"""
-    if not nlp:
-        # Simple sentence splitting fallback
-        sentences = document.split('.')
-        chunks = []
-        current_chunk = ""
-
-        for sentence in sentences:
-            test_chunk = current_chunk + sentence + "."
-            tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
-
-            if len(tokens) <= token_max_length:
-                current_chunk = test_chunk
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = sentence + "."
-
-        if current_chunk:
-            chunks.append(current_chunk)
-
-        return chunks
+    # Simple sentence splitting fallback
+    sentences = document.split('.')
+    chunks = []
+    current_chunk = ""

-    sents = []
-    length = 0
-    doc = nlp(document)
-    s = ''
-    for sentence in doc.sents:
-        tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)['input_ids']
-        length += len(tokens_in_sentence)
-        if length <= token_max_length:
-            s += sentence.text
+    for sentence in sentences:
+        if not sentence.strip():
+            continue
+        test_chunk = current_chunk + sentence + "."
+        tokens = tokenizer(test_chunk, truncation=False, padding=False)['input_ids']
+
+        if len(tokens) <= token_max_length:
+            current_chunk = test_chunk
         else:
-            sents.append(s)
-            s = sentence.text
-            length = len(tokens_in_sentence)
+            if current_chunk:
+                chunks.append(current_chunk)
+            current_chunk = sentence + "."
+
+    if current_chunk:
+        chunks.append(current_chunk)

-    # Append last string
-    if s:
-        sents.append(s)
+    sents = chunks

     logger.info(f'Returning {len(sents)} number of chunk strings')
     return sents
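
Reviewer note on the simplified summarizer path: with spaCy disabled, get_significant_words_list, get_frequency_words, and get_sent_strength reduce to regex tokenization plus a Counter over document-level word frequencies. A rough standalone sketch of that scoring idea (stop-word filtering omitted for brevity; the sample text is invented for illustration):

    import re
    from collections import Counter

    text = "Transformers changed NLP. Transformers rely on attention. Cats sleep a lot."
    sentences = [s.strip() + "." for s in text.split('.') if s.strip()]

    # Word frequencies over the whole document (regex tokens, len > 2 as in the diff).
    freq = Counter(w for w in re.findall(r'\b[a-zA-Z]+\b', text.lower()) if len(w) > 2)

    # Score each sentence by summing the document-level frequency of its words.
    sent_strength = {
        s: sum(freq[w] for w in re.findall(r'\b[a-zA-Z]+\b', s.lower()))
        for s in sentences
    }

    best = max(sent_strength, key=sent_strength.get)
    print(best)  # a "Transformers" sentence outscores "Cats sleep a lot."
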
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 06a3ac9..3b48fbe 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -12,10 +12,12 @@ pytest-asyncio==0.21.1
 pytest-cov==4.1.0
 torch==2.5.1
 transformers==4.40.0
-spacy==3.7.6
+spacy==3.8.2
 newspaper3k==0.2.8
+lxml_html_clean==0.4.2
 sumy==0.11.0
-python-jose[cryptography]==3.3.0
+PyJWT==2.8.0
+cryptography==41.0.7
 passlib[bcrypt]==1.7.4
 python-dotenv==1.0.0
 gunicorn==21.2.0
\ No newline at end of file
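
Reviewer note, a usage sketch for the simplified get_nest_sentences in summarizer.py: the import path assumes the repo layout shown in the diff (backend/app/core/summarizer.py), and the BART checkpoint name is an assumption; any AutoTokenizer behaves the same here.

    from transformers import AutoTokenizer

    from app.core.summarizer import get_nest_sentences

    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")  # assumed checkpoint
    document = "This is a filler sentence for the chunking demo. " * 400

    # Each returned chunk should fit within the 1024-token budget, since a
    # chunk is only appended while it still passes the length check.
    chunks = get_nest_sentences(document, tokenizer, token_max_length=1024)
    for chunk in chunks:
        n_tokens = len(tokenizer(chunk, truncation=False, padding=False)['input_ids'])
        assert n_tokens <= 1024
    print(f"{len(chunks)} chunks")
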