Add podgrab featureset

This commit is contained in:
Cody Cook 2025-06-16 22:55:39 -07:00
commit 233dd5b5c0
33 changed files with 2315 additions and 125 deletions

View file

@ -0,0 +1,155 @@
"""
OPML import/export functionality for Podcastrr.
"""
import xml.etree.ElementTree as ET
from xml.dom import minidom
import logging
from datetime import datetime
from flask import current_app
# Set up logging
logger = logging.getLogger(__name__)
def parse_opml(opml_content):
    """
    Parse OPML content and extract podcast feed URLs.

    Args:
        opml_content (str): OPML file content.

    Returns:
        list: List of dictionaries, one per feed, with keys 'feed_url',
        'title', 'description' and 'html_url'. Returns an empty list when
        the OPML is malformed or has no body element.
    """
    try:
        root = ET.fromstring(opml_content)

        podcasts = []

        # OPML places podcast outlines inside the <body> element.
        body = root.find('body')
        if body is None:
            logger.error("OPML file has no body element")
            return []

        # Outlines may be nested inside folder outlines, so search recursively.
        for outline in body.findall('.//outline'):
            # Only outlines carrying an xmlUrl attribute represent feeds;
            # folder outlines without it are skipped.
            xml_url = outline.get('xmlUrl')
            if xml_url:
                podcast = {
                    'feed_url': xml_url,
                    # Fall back through title -> text -> placeholder so an
                    # empty-string attribute never produces an empty title.
                    'title': outline.get('title') or outline.get('text') or 'Unknown Podcast',
                    'description': outline.get('description', ''),
                    'html_url': outline.get('htmlUrl', '')
                }
                podcasts.append(podcast)

        logger.info(f"Parsed OPML file and found {len(podcasts)} podcasts")
        return podcasts
    except Exception as e:
        logger.error(f"Error parsing OPML file: {str(e)}")
        return []
def generate_opml(podcasts):
    """
    Generate OPML content from a list of podcasts.

    Args:
        podcasts (list): List of Podcast model instances (objects exposing
            ``title``, ``feed_url`` and ``description`` attributes).

    Returns:
        str: Pretty-printed OPML 2.0 document, or an empty string on error.
    """
    from datetime import timezone

    try:
        # Create the root element
        root = ET.Element('opml')
        root.set('version', '2.0')

        # Head: document title plus RFC-822 style creation date.
        head = ET.SubElement(root, 'head')
        title = ET.SubElement(head, 'title')
        title.text = 'Podcastrr Subscriptions'
        date_created = ET.SubElement(head, 'dateCreated')
        # Use an aware UTC datetime; datetime.utcnow() is deprecated and
        # returns a naive value. The rendered string is identical.
        date_created.text = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT')

        # Body: one outline element per subscribed podcast.
        body = ET.SubElement(root, 'body')
        for podcast in podcasts:
            outline = ET.SubElement(body, 'outline')
            outline.set('type', 'rss')
            outline.set('text', podcast.title)
            outline.set('title', podcast.title)
            outline.set('xmlUrl', podcast.feed_url)
            if podcast.description:
                outline.set('description', podcast.description)

        # Round-trip through minidom purely for indentation.
        xml_str = ET.tostring(root, encoding='utf-8')
        parsed_xml = minidom.parseString(xml_str)
        pretty_xml = parsed_xml.toprettyxml(indent="  ")

        logger.info(f"Generated OPML file with {len(podcasts)} podcasts")
        return pretty_xml
    except Exception as e:
        logger.error(f"Error generating OPML file: {str(e)}")
        return ""
def import_podcasts_from_opml(opml_content):
    """
    Import podcasts from OPML content into the database.

    Args:
        opml_content (str): OPML file content.

    Returns:
        dict: Import statistics with keys 'total', 'imported', 'skipped'
        and 'errors'.
    """
    from app.models.podcast import Podcast
    from app.models.database import db
    from app.services.podcast_updater import update_podcast

    podcasts = parse_opml(opml_content)

    stats = {
        'total': len(podcasts),
        'imported': 0,
        'skipped': 0,
        'errors': 0
    }

    for podcast_data in podcasts:
        try:
            # Skip feeds that are already subscribed (matched by feed URL).
            existing = Podcast.query.filter_by(feed_url=podcast_data['feed_url']).first()
            if existing:
                logger.info(f"Podcast already exists: {podcast_data['title']}")
                stats['skipped'] += 1
                continue

            # Create and persist the new podcast record.
            podcast = Podcast(
                title=podcast_data['title'],
                description=podcast_data.get('description', ''),
                feed_url=podcast_data['feed_url']
            )
            db.session.add(podcast)
            db.session.commit()

            # Fetch episodes for the new podcast. A failure here should not
            # abort the import; the podcast still counts as imported.
            try:
                update_podcast(podcast.id)
            except Exception as e:
                logger.error(f"Error updating podcast {podcast.title}: {str(e)}")

            stats['imported'] += 1
            logger.info(f"Imported podcast: {podcast.title}")
        except Exception as e:
            # Roll back so a failed commit does not leave the session in an
            # invalid state and poison the remaining iterations.
            db.session.rollback()
            stats['errors'] += 1
            logger.error(f"Error importing podcast: {str(e)}")

    return stats

View file

@ -173,6 +173,8 @@ def format_filename(format_string, podcast, episode):
# If episode_number exists but is not a digit, format as S01E{episode_number}
else f"S{episode.season or 1:02d}E{episode.episode_number}"
if episode.episode_number
# If neither season nor episode_number are available, use published date
else episode.published_date.strftime('%Y-%m-%d') if episode.published_date
# Otherwise, return empty string
else ''
),
@ -195,10 +197,23 @@ def format_filename(format_string, podcast, episode):
# Handle empty path segments by removing them
path_parts = formatted_path.split(os.path.sep)
path_parts = [part for part in path_parts if part.strip()]
# Remove empty segments and segments that would be just placeholders without values
cleaned_parts = []
for part in path_parts:
part = part.strip()
if not part:
continue
# Check for common placeholders without values
if part in ["Season ", "Season", "Episode ", "Episode", "E", "S"]:
continue
# Check for patterns like "S01E" without an episode number
if part.startswith("S") and part.endswith("E") and len(part) > 2:
continue
cleaned_parts.append(part)
# Rejoin the path with proper separators
return os.path.sep.join(path_parts)
return os.path.sep.join(cleaned_parts)
def sanitize_filename(filename):
"""
@ -277,6 +292,7 @@ def delete_old_episodes(days=30):
def verify_downloaded_episodes(podcast_id=None, progress_callback=None):
"""
Verify that downloaded episodes still exist on disk and update their status.
Also checks for existing files for episodes that aren't marked as downloaded.
Args:
podcast_id (int, optional): ID of the podcast to check. If None, check all podcasts.
@ -286,23 +302,24 @@ def verify_downloaded_episodes(podcast_id=None, progress_callback=None):
dict: Statistics about the verification process.
"""
from app.models.podcast import Episode, Podcast
from app.models.settings import Settings
# Get episodes to check
# First, verify episodes that are marked as downloaded
query = Episode.query.filter(Episode.downloaded == True)
if podcast_id:
query = query.filter(Episode.podcast_id == podcast_id)
episodes = query.all()
total = len(episodes)
downloaded_episodes = query.all()
total_downloaded = len(downloaded_episodes)
if progress_callback:
progress_callback(0, f"Verifying {total} downloaded episodes")
progress_callback(0, f"Verifying {total_downloaded} downloaded episodes")
missing = 0
for i, episode in enumerate(episodes):
if progress_callback and total > 0:
progress = int((i / total) * 100)
progress_callback(progress, f"Verifying episode {i+1}/{total}")
for i, episode in enumerate(downloaded_episodes):
if progress_callback and total_downloaded > 0:
progress = int((i / total_downloaded) * 50) # Use first half of progress for verification
progress_callback(progress, f"Verifying episode {i+1}/{total_downloaded}")
if not episode.file_path or not os.path.exists(episode.file_path):
episode.downloaded = False
@ -312,15 +329,133 @@ def verify_downloaded_episodes(podcast_id=None, progress_callback=None):
db.session.commit()
if progress_callback:
progress_callback(100, f"Verification complete. {missing} episodes marked as not downloaded.")
# Now check for existing files for episodes that aren't marked as downloaded
query = Episode.query.filter(Episode.downloaded == False)
if podcast_id:
query = query.filter(Episode.podcast_id == podcast_id)
logger.info(f"Verified {total} episodes. {missing} were missing.")
undownloaded_episodes = query.all()
total_undownloaded = len(undownloaded_episodes)
if progress_callback:
progress_callback(50, f"Checking for existing files for {total_undownloaded} undownloaded episodes")
found = 0
if total_undownloaded > 0 and podcast_id:
# Get the podcast
podcast = Podcast.query.get(podcast_id)
if not podcast:
logger.error(f"Podcast with ID {podcast_id} not found")
return {
'total_checked': total_downloaded,
'missing': missing,
'found': 0
}
# Get settings
settings = Settings.query.first()
if not settings:
settings = Settings(
download_path=current_app.config['DOWNLOAD_PATH'],
naming_format="{podcast_title}/{episode_title}"
)
db.session.add(settings)
db.session.commit()
# Use podcast's naming format if available, otherwise use global settings
naming_format = podcast.naming_format or settings.naming_format
download_path = settings.download_path
# Check each undownloaded episode for existing files
for i, episode in enumerate(undownloaded_episodes):
if progress_callback:
progress = 50 + int((i / total_undownloaded) * 50) # Use second half of progress for file matching
progress_callback(progress, f"Checking for file for episode {i+1}/{total_undownloaded}")
try:
# Format filename using the naming format
filename = format_filename(naming_format, podcast, episode)
# Check for common audio file extensions
extensions = ['.mp3', '.m4a', '.ogg', '.wav']
for ext in extensions:
file_path = os.path.normpath(os.path.join(download_path, filename + ext))
if os.path.exists(file_path):
logger.info(f"Found existing file for episode: {file_path}")
episode.downloaded = True
episode.file_path = file_path
found += 1
break
except Exception as e:
logger.error(f"Error checking for existing file for episode {episode.title}: {str(e)}")
db.session.commit()
if progress_callback:
progress_callback(100, f"Verification complete. {missing} episodes marked as not downloaded, {found} files matched.")
logger.info(f"Verified {total_downloaded} episodes. {missing} were missing. Found files for {found} undownloaded episodes.")
return {
'total_checked': total,
'missing': missing
'total_checked': total_downloaded,
'missing': missing,
'found': found
}
def download_all_episodes(podcast_id, progress_callback=None):
    """
    Download all episodes of a podcast that haven't been downloaded yet.

    Args:
        podcast_id: ID of the Podcast to download all episodes for.
        progress_callback (callable, optional): Callback function for progress updates.

    Returns:
        dict: Statistics about the download process with keys 'total',
        'downloaded' and 'failed'.

    Raises:
        ValueError: If no podcast with the given ID exists.
    """
    from app.models.podcast import Podcast, Episode

    if progress_callback:
        progress_callback(2, "Loading podcast data")

    podcast = Podcast.query.get(podcast_id)
    if not podcast:
        raise ValueError(f"Podcast with ID {podcast_id} not found")

    # Queue only the episodes that are not yet on disk.
    pending = Episode.query.filter_by(podcast_id=podcast_id, downloaded=False).all()
    total_episodes = len(pending)

    if progress_callback:
        progress_callback(5, f"Found {total_episodes} episodes to download")

    if total_episodes == 0:
        if progress_callback:
            progress_callback(100, "No episodes to download")
        return {"total": 0, "downloaded": 0, "failed": 0}

    stats = {"total": total_episodes, "downloaded": 0, "failed": 0}

    for index, episode in enumerate(pending):
        if progress_callback:
            # Scale from 5% to 95% across the episode list.
            progress = 5 + int((index / total_episodes) * 90)
            progress_callback(progress, f"Downloading episode {index+1}/{total_episodes}: {episode.title}")

        # A single failed download is logged and counted; the loop continues.
        try:
            download_episode(episode.id)
            stats["downloaded"] += 1
            logger.info(f"Downloaded episode {index+1}/{total_episodes}: {episode.title}")
        except Exception as e:
            stats["failed"] += 1
            logger.error(f"Error downloading episode {episode.title}: {str(e)}")

    if progress_callback:
        progress_callback(100, f"Download complete. Downloaded {stats['downloaded']} episodes, {stats['failed']} failed.")

    logger.info(f"Podcast archive download completed: {stats}")
    return stats
def rename_episode(episode_id, new_format=None, progress_callback=None):
"""
Rename a downloaded episode file using a new format.

View file

@ -142,15 +142,126 @@ def get_podcast_episodes(feed_url):
'published_date': _parse_date(entry.get('published')),
'guid': entry.get('id', ''),
'duration': _parse_duration(entry.get('itunes_duration', '')),
'season': entry.get('itunes_season'), # Season number
'episode_number': entry.get('itunes_episode', ''), # Episode number within season
'season': None, # Default to None
'episode_number': None, # Default to None, will try to extract from various sources
'explicit': False # Default to False
}
# Handle explicit flag safely
itunes_explicit = entry.get('itunes_explicit', '')
if isinstance(itunes_explicit, str) and itunes_explicit:
episode['explicit'] = itunes_explicit.lower() == 'yes'
# Handle season tag - try multiple ways to access it
try:
# Try as attribute first
if hasattr(entry, 'itunes_season'):
episode['season'] = int(entry.itunes_season) if entry.itunes_season else None
logger.debug(f"Found season as attribute: {episode['season']}")
# Try as dictionary key
elif entry.get('itunes_season'):
episode['season'] = int(entry.get('itunes_season')) if entry.get('itunes_season') else None
logger.debug(f"Found season as dict key: {episode['season']}")
# Try looking in tags
elif hasattr(entry, 'tags'):
for tag in entry.tags:
if tag.get('term', '').startswith('Season'):
try:
episode['season'] = int(tag.get('term').replace('Season', '').strip())
logger.debug(f"Found season in tags: {episode['season']}")
break
except (ValueError, TypeError):
pass
except Exception as e:
logger.warning(f"Error parsing season: {str(e)}")
# Handle episode number - try multiple ways to access it
try:
# Try as attribute first (itunes_episode)
if hasattr(entry, 'itunes_episode') and entry.itunes_episode:
episode['episode_number'] = entry.itunes_episode
logger.debug(f"Found episode number as attribute: {episode['episode_number']}")
# Try as dictionary key
elif entry.get('itunes_episode'):
episode['episode_number'] = entry.get('itunes_episode')
logger.debug(f"Found episode number as dict key: {episode['episode_number']}")
# Try to extract from title if it contains "Episode X" or "Ep X" or "#X"
elif episode['title']:
import re
# Common patterns for episode numbers in titles
patterns = [
r'Episode\s+(\d+)', # "Episode 123"
r'Ep\s*(\d+)', # "Ep123" or "Ep 123"
r'#(\d+)', # "#123"
r'E(\d+)', # "E123" or "S1E123"
]
for pattern in patterns:
match = re.search(pattern, episode['title'], re.IGNORECASE)
if match:
episode['episode_number'] = match.group(1)
logger.debug(f"Extracted episode number from title: {episode['episode_number']}")
break
except Exception as e:
logger.warning(f"Error parsing episode number: {str(e)}")
# Handle explicit flag - try multiple ways to access it
try:
# Try as attribute first
if hasattr(entry, 'itunes_explicit'):
explicit_value = entry.itunes_explicit
if isinstance(explicit_value, str):
episode['explicit'] = explicit_value.lower() in ('yes', 'true')
logger.debug(f"Found explicit as attribute: {episode['explicit']}")
# Try as dictionary key
elif entry.get('itunes_explicit'):
explicit_value = entry.get('itunes_explicit')
if isinstance(explicit_value, str):
episode['explicit'] = explicit_value.lower() in ('yes', 'true')
logger.debug(f"Found explicit as dict key: {episode['explicit']}")
except Exception as e:
logger.warning(f"Error parsing explicit flag: {str(e)}")
# Handle the different combinations of season and episode numbers
# Case 1: No season, no episode - use published date to create a sequential order
if episode['season'] is None and (episode['episode_number'] is None or episode['episode_number'] == ''):
if episode['published_date']:
# Use the publication date to create a pseudo-episode number
# Format: YYYYMMDD (e.g., 20230101 for January 1, 2023)
episode['episode_number'] = episode['published_date'].strftime('%Y%m%d')
logger.debug(f"No season or episode number, using date as episode number: {episode['episode_number']}")
else:
# If no publication date, use a placeholder
episode['episode_number'] = "unknown"
logger.debug("No season, episode number, or date available")
# Case 2: No season, but episode number exists - keep episode number as is
elif episode['season'] is None and episode['episode_number'] is not None:
logger.debug(f"Using episode number without season: {episode['episode_number']}")
# Case 3: Season exists, no episode number - use season as prefix for ordering
elif episode['season'] is not None and (episode['episode_number'] is None or episode['episode_number'] == ''):
if episode['published_date']:
# Use the publication date with season prefix
# Format: S01_YYYYMMDD
episode['episode_number'] = f"S{episode['season']:02d}_{episode['published_date'].strftime('%Y%m%d')}"
logger.debug(f"Season without episode number, using season+date: {episode['episode_number']}")
else:
# If no publication date, use season with unknown suffix
episode['episode_number'] = f"S{episode['season']:02d}_unknown"
logger.debug(f"Season without episode number or date: {episode['episode_number']}")
# Case 4: Both season and episode exist - format as S01E02
elif episode['season'] is not None and episode['episode_number'] is not None:
# Check if episode_number is already formatted as S01E02
import re
if not re.match(r'^S\d+E\d+$', str(episode['episode_number']), re.IGNORECASE):
try:
# Try to convert episode_number to integer for proper formatting
ep_num = int(episode['episode_number'])
episode['episode_number'] = f"S{episode['season']:02d}E{ep_num:02d}"
logger.debug(f"Formatted season and episode as: {episode['episode_number']}")
except (ValueError, TypeError):
# If episode_number can't be converted to int, use as is with season prefix
episode['episode_number'] = f"S{episode['season']:02d}_{episode['episode_number']}"
logger.debug(f"Using season prefix with non-numeric episode: {episode['episode_number']}")
else:
logger.debug(f"Episode already formatted correctly: {episode['episode_number']}")
# Generate a GUID if one is not provided
if not episode['guid']:

View file

@ -128,20 +128,60 @@ def update_podcast(podcast_id, progress_callback=None):
published_date=episode_data.get('published_date'),
duration=episode_data.get('duration'),
file_size=episode_data.get('file_size'),
season=episode_data.get('season'), # Season number
episode_number=episode_data.get('episode_number'),
guid=episode_data['guid'],
downloaded=False
downloaded=False,
explicit=episode_data.get('explicit') # Explicit flag
)
db.session.add(episode)
stats['new_episodes'] += 1
logger.info(f"Added new episode: {episode.title}")
# Auto-download if enabled
if podcast.auto_download and episode.audio_url:
try:
# Need to commit first to ensure episode has an ID
# Need to commit first to ensure episode has an ID
db.session.commit()
# Check if file already exists for this episode
try:
from app.services.podcast_downloader import format_filename
import os
from app.models.settings import Settings
settings = Settings.query.first()
if not settings:
settings = Settings(
download_path=current_app.config['DOWNLOAD_PATH'],
naming_format="{podcast_title}/{episode_title}"
)
db.session.add(settings)
db.session.commit()
# Use podcast's naming format if available, otherwise use global settings
naming_format = podcast.naming_format or settings.naming_format
# Format filename using the naming format
filename = format_filename(naming_format, podcast, episode)
download_path = settings.download_path
# Check for common audio file extensions
extensions = ['.mp3', '.m4a', '.ogg', '.wav']
for ext in extensions:
file_path = os.path.normpath(os.path.join(download_path, filename + ext))
if os.path.exists(file_path):
logger.info(f"Found existing file for episode: {file_path}")
episode.downloaded = True
episode.file_path = file_path
db.session.commit()
break
logger.info(f"Checked for existing files for episode: {episode.title}")
except Exception as e:
logger.error(f"Error checking for existing files for episode {episode.title}: {str(e)}")
# Auto-download if enabled and not already downloaded
if podcast.auto_download and episode.audio_url and not episode.downloaded:
try:
download_episode(episode.id)
stats['episodes_downloaded'] += 1
logger.info(f"Auto-downloaded episode: {episode.title}")

View file

@ -172,12 +172,12 @@ class TaskManager:
with self.lock:
return list(self.tasks.values())
def clean_old_tasks(self, max_age_seconds=60):
def clean_old_tasks(self, max_age_seconds=86400):
"""
Remove old completed or failed tasks.
Args:
max_age_seconds (int): Maximum age of tasks to keep in seconds
max_age_seconds (int): Maximum age of tasks to keep in seconds (default: 24 hours)
Returns:
int: Number of tasks removed