Dmitry Beresnev committed
Commit 9a265da · 1 Parent(s): 5260ec0

add cache for the downloaded data
src/core/ticker_scanner/__init__.py CHANGED
@@ -1,6 +1,7 @@
 """
 Ticker Scanner Module
 Monitors and analyzes stock tickers for growth potential
+Includes in-memory caching with 2-hour expiry
 """
 
 from src.core.ticker_scanner.ticker_analyzer import TickerAnalyzer
@@ -8,6 +9,10 @@ from src.core.ticker_scanner.scheduler import Scheduler
 from src.core.ticker_scanner.growth_speed_analyzer import GrowthSpeedAnalyzer
 from src.core.ticker_scanner.core_enums import StockExchange, GrowthCategory
 from src.core.ticker_scanner.growth_metrics import GrowthSpeedMetrics
+from src.core.ticker_scanner.parallel_data_downloader import (
+    clear_cache,
+    get_cache_stats
+)
 
 __all__ = [
     'TickerAnalyzer',
@@ -16,4 +21,6 @@ __all__ = [
     'StockExchange',
     'GrowthCategory',
     'GrowthSpeedMetrics',
+    'clear_cache',
+    'get_cache_stats',
 ]
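
With clear_cache and get_cache_stats re-exported at the package root, callers no longer need to reach into parallel_data_downloader directly. A minimal sketch of the resulting call site (the surrounding script is hypothetical, not part of this commit):

# Hypothetical caller; only the imported names come from this commit.
from src.core.ticker_scanner import clear_cache, get_cache_stats

stats = get_cache_stats()
print(f"{stats['valid_cached']}/{stats['total_cached']} cached entries still valid")

clear_cache()  # drop everything to force fresh downloads on the next scan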
src/core/ticker_scanner/parallel_data_downloader.py CHANGED
@@ -9,7 +9,6 @@ import time
 import random
 from itertools import islice
 from typing import Any, Optional
-from datetime import datetime, timedelta
 from concurrent.futures import ProcessPoolExecutor, as_completed
 
 import yfinance as yf
@@ -17,6 +16,7 @@ import yfinance as yf
 from src.core.ticker_scanner.core_enums import StockExchange
 from src.core.ticker_scanner.tickers_provider import TickersProvider
 from src.telegram_bot.logger import main_logger as logger
+from src.core.ticker_scanner.ticker_cache import TickerCache
 
 
 MAX_WORKERS = 8  # Number of parallel processes
@@ -24,75 +24,35 @@ MAX_RETRIES = 3  # Retry count on failure
 SLEEP_BETWEEN_RETRIES = 1.0  # Seconds between retries
 BATCH_SIZE = 50  # Number of tickers per batch
 MIN_DATA_POINTS = 50  # Minimum number of price points required
-CACHE_EXPIRY_HOURS = 2  # Cache expiry time in hours
 
-# In-memory cache for ticker data
-_ticker_cache: dict[str, dict[str, Any]] = {}
-_cache_timestamps: dict[str, datetime] = {}
 
-
-def _is_cache_valid(ticker: str) -> bool:
-    """Check if cached data for ticker is still valid (not expired)"""
-    if ticker not in _cache_timestamps:
-        return False
-
-    cache_age = datetime.now() - _cache_timestamps[ticker]
-    return cache_age < timedelta(hours=CACHE_EXPIRY_HOURS)
-
-
-def _get_cached_data(ticker: str) -> Optional[dict[str, Any]]:
-    """Get cached data if valid, None otherwise"""
-    if _is_cache_valid(ticker):
-        logger.debug(f"Using cached data for {ticker}")
-        return _ticker_cache.get(ticker)
-    return None
-
-
-def _cache_data(ticker: str, data: dict[str, Any]) -> None:
-    """Cache ticker data with current timestamp"""
-    _ticker_cache[ticker] = data
-    _cache_timestamps[ticker] = datetime.now()
-    logger.debug(f"Cached data for {ticker}")
+# Global cache instance
+_cache = TickerCache()
 
 
 def clear_cache() -> None:
     """Clear all cached data (useful for testing or manual refresh)"""
-    global _ticker_cache, _cache_timestamps
-    _ticker_cache.clear()
-    _cache_timestamps.clear()
-    logger.info("Cache cleared")
+    _cache.clear()
 
 
 def get_cache_stats() -> dict[str, Any]:
     """Get cache statistics"""
-    valid_count = sum(1 for ticker in _ticker_cache.keys() if _is_cache_valid(ticker))
-    return {
-        'total_cached': len(_ticker_cache),
-        'valid_cached': valid_count,
-        'expired_cached': len(_ticker_cache) - valid_count
-    }
+    return _cache.get_stats()
 
 
-def fetch_prices(ticker: str, max_retries: int = MAX_RETRIES, use_cache: bool = True) -> dict[str, Any]:
+def fetch_prices(ticker: str, max_retries: int = MAX_RETRIES, use_cache: bool = False) -> Optional[dict[str, Any]]:
     """
     Download all-time closing prices for a single ticker safely.
-    Uses in-memory cache if available and not expired.
 
     Args:
         ticker: Stock ticker symbol
        max_retries: Maximum number of retry attempts
-        use_cache: Whether to use cached data if available
+        use_cache: Whether to use cached data (NOTE: typically False when called from subprocess)
 
     Returns:
        dict {'ticker': ticker, 'prices': ndarray, 'dates': DatetimeIndex} or None if failed
     """
-    # Check cache first
-    if use_cache:
-        cached_data = _get_cached_data(ticker)
-        if cached_data is not None:
-            return cached_data
-
-    # Download fresh data
+    # Download fresh data (cache is handled in main process)
     for attempt in range(max_retries):
         try:
             df = yf.download(ticker, period="max", progress=False, auto_adjust=True)
@@ -123,18 +83,12 @@ def fetch_prices(ticker: str, max_retries: int = MAX_RETRIES, use_cache: bool =
             if prices.ndim > 1:
                 prices = prices.flatten()
 
-            result = {
+            return {
                 "ticker": ticker,
                 "prices": prices,
                 "dates": dates
             }
 
-            # Cache the result
-            if use_cache:
-                _cache_data(ticker, result)
-
-            return result
-
         except yf.shared.YFRateLimitError:
             wait = SLEEP_BETWEEN_RETRIES + random.random()
             logger.warning(f"Rate limited for {ticker}. Waiting {wait:.1f}s and retrying...")
@@ -155,7 +109,8 @@ def batch(iterable: list[str], n: int = BATCH_SIZE):
             break
         yield chunk
 
-def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS,
+def download_tickers_parallel(tickers: list[str], exchange: str,
+                              max_workers: int = MAX_WORKERS,
                               use_cache: bool = True) -> list[dict[str, Any]]:
     """
     Download a large list of tickers in parallel batches.
@@ -163,6 +118,7 @@ def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS
 
     Args:
         tickers: List of ticker symbols to download
+        exchange: Exchange name (e.g., "NASDAQ", "NYSE")
         max_workers: Number of parallel workers
         use_cache: Whether to use cached data
 
@@ -175,7 +131,7 @@ def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS
 
     if use_cache:
         for ticker in tickers:
-            cached_data = _get_cached_data(ticker)
+            cached_data = _cache.get(exchange, ticker)
             if cached_data:
                 cached_results.append(cached_data)
             else:
@@ -194,7 +150,7 @@ def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS
     logger.info(f"Downloading {len(tickers_to_download)} tickers...")
     for batch_num, ticker_batch in enumerate(batch(tickers_to_download, BATCH_SIZE), start=1):
         logger.info(f"Processing batch {batch_num}: {len(ticker_batch)} tickers")
-        results, failed = process_batch(ticker_batch, max_workers)
+        results, failed = process_batch(ticker_batch, exchange, max_workers)
         all_results.extend(results)
         all_failed.extend(failed)
         # small sleep between batches to reduce rate-limit chance
@@ -206,20 +162,30 @@ def download_tickers_parallel(tickers: list[str], max_workers: int = MAX_WORKERS
 
     return all_results
 
-def process_batch(ticker_batch: list[str], max_workers: int) -> tuple[list[dict[str, Any]], list[Any]]:
+def process_batch(ticker_batch: list[str], exchange: str, max_workers: int) -> tuple[list[dict[str, Any]], list[Any]]:
     """
     Process a batch of tickers in parallel using multiprocessing.
     Returns tuple (successful_results, failed_tickers)
+
+    Args:
+        ticker_batch: List of ticker symbols to process
+        exchange: Exchange name for cache key
+        max_workers: Number of parallel workers
+
+    Note: Downloads always fetch fresh data (cache checked before this step)
     """
     results = []
     failed = []
     with ProcessPoolExecutor(max_workers=max_workers) as executor:
-        futures = {executor.submit(fetch_prices, t): t for t in ticker_batch}
+        # Don't use cache in subprocess - already handled in main process
+        futures = {executor.submit(fetch_prices, t, use_cache=False): t for t in ticker_batch}
        for future in as_completed(futures):
             ticker = futures[future]
             try:
                 res = future.result()
                 if res:
+                    # Cache the result in the main process after download
+                    _cache.set(exchange, res['ticker'], res)
                     results.append(res)
                 else:
                     failed.append(ticker)
@@ -249,7 +215,7 @@ def run_parallel_data_downloader(exchange: StockExchange = StockExchange.NASDAQ,
         logger.info(f"Cache stats: {cache_stats['valid_cached']} valid, {cache_stats['expired_cached']} expired")
 
     logger.info(f"Starting download for {len(tickers)} tickers from {exchange.value}...")
-    data = download_tickers_parallel(tickers, use_cache=use_cache)
+    data = download_tickers_parallel(tickers, exchange.value, use_cache=use_cache)
     logger.info(f"Retrieved {len(data)} tickers successfully")
     return data
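
The central design change in this file is that all cache reads and writes now happen in the parent process: each ProcessPoolExecutor worker runs in its own interpreter, so anything fetch_prices wrote to a module-level cache (as the old code did) vanished when the worker exited. A standalone sketch of that pitfall, independent of the repo's code:

# Standalone illustration (not from the repo): module-level state mutated
# inside a worker process never reaches the parent, which is why this
# commit moves _cache.set() into the main process after future.result().
from concurrent.futures import ProcessPoolExecutor

_cache: dict[str, int] = {}

def worker(key: str) -> int:
    _cache[key] = 42  # mutates the worker's own copy of the module
    return len(_cache)

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=1) as ex:
        print(ex.submit(worker, "AAPL").result())  # 1 - the worker saw its write
    print(len(_cache))                             # 0 - the parent's cache is untouched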
 
src/core/ticker_scanner/ticker_cache.py ADDED
@@ -0,0 +1,70 @@
+from typing import Any, Optional
+from datetime import datetime, timedelta
+
+from src.telegram_bot.logger import main_logger as logger
+
+
+CACHE_EXPIRY_HOURS = 2  # Cache expiry time in hours
+
+
+class TickerCache:
+    """
+    In-memory cache for ticker data with automatic expiry.
+    Uses exchange:ticker as key to support multiple exchanges.
+    """
+
+    def __init__(self, expiry_hours: int = CACHE_EXPIRY_HOURS):
+        self._cache: dict[str, dict[str, Any]] = {}
+        self._timestamps: dict[str, datetime] = {}
+        self._expiry_hours = expiry_hours
+
+    def _make_key(self, exchange: str, ticker: str) -> str:
+        """Create cache key from exchange and ticker"""
+        return f"{exchange}:{ticker}"
+
+    def is_valid(self, exchange: str, ticker: str) -> bool:
+        """Check if cached data is still valid (not expired)"""
+        key = self._make_key(exchange, ticker)
+        if key not in self._timestamps:
+            return False
+
+        cache_age = datetime.now() - self._timestamps[key]
+        return cache_age < timedelta(hours=self._expiry_hours)
+
+    def get(self, exchange: str, ticker: str) -> Optional[dict[str, Any]]:
+        """Get cached data if valid, None otherwise"""
+        if self.is_valid(exchange, ticker):
+            key = self._make_key(exchange, ticker)
+            logger.debug(f"Using cached data for {key}")
+            return self._cache.get(key)
+        return None
+
+    def set(self, exchange: str, ticker: str, data: dict[str, Any]) -> None:
+        """Cache ticker data with timestamp"""
+        key = self._make_key(exchange, ticker)
+        self._cache[key] = data
+        self._timestamps[key] = datetime.now()
+        logger.debug(f"Cached data for {key}")
+
+    def clear(self) -> None:
+        """Clear all cached data"""
+        self._cache.clear()
+        self._timestamps.clear()
+        logger.info("Cache cleared")
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get cache statistics"""
+        valid_count = 0
+        for key in self._cache.keys():
+            # Parse key to get exchange and ticker
+            parts = key.split(':', 1)
+            if len(parts) == 2:
+                exchange, ticker = parts
+                if self.is_valid(exchange, ticker):
+                    valid_count += 1
+
+        return {
+            'total_cached': len(self._cache),
+            'valid_cached': valid_count,
+            'expired_cached': len(self._cache) - valid_count
+        }
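
A quick usage sketch of the new class (the price payload is fabricated for illustration; assumes the project's src.telegram_bot.logger import resolves):

# Hypothetical usage; the payload is a stand-in for real yfinance output.
from src.core.ticker_scanner.ticker_cache import TickerCache

cache = TickerCache(expiry_hours=2)
cache.set("NASDAQ", "AAPL", {"ticker": "AAPL", "prices": [187.4, 189.1], "dates": []})

assert cache.get("NASDAQ", "AAPL") is not None
assert cache.get("NYSE", "AAPL") is None  # exchange is part of the key

print(cache.get_stats())  # {'total_cached': 1, 'valid_cached': 1, 'expired_cached': 0}
cache.clear()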