Add podgrab featureset

This commit is contained in:
Cody Cook 2025-06-16 22:55:39 -07:00
commit 233dd5b5c0
33 changed files with 2315 additions and 125 deletions

View file

@ -0,0 +1,155 @@
"""
OPML import/export functionality for Podcastrr.
"""
import xml.etree.ElementTree as ET
from xml.dom import minidom
import logging
from datetime import datetime
from flask import current_app
# Set up logging
logger = logging.getLogger(__name__)
def parse_opml(opml_content):
    """
    Parse OPML content and extract podcast feed URLs.

    Args:
        opml_content (str): OPML file content.

    Returns:
        list: List of dictionaries, one per feed, with keys 'feed_url',
        'title', 'description' and 'html_url'. Returns an empty list when
        the OPML is malformed or has no body element.
    """
    try:
        root = ET.fromstring(opml_content)

        podcasts = []

        # OPML places podcast outlines inside the <body> element.
        body = root.find('body')
        if body is None:
            logger.error("OPML file has no body element")
            return []

        # Outlines may be nested inside folder outlines, so search recursively.
        for outline in body.findall('.//outline'):
            # Only outlines carrying an xmlUrl attribute represent feeds;
            # folder outlines without it are skipped.
            xml_url = outline.get('xmlUrl')
            if xml_url:
                podcast = {
                    'feed_url': xml_url,
                    # Fall back through title -> text -> placeholder so an
                    # empty-string attribute never produces an empty title.
                    'title': outline.get('title') or outline.get('text') or 'Unknown Podcast',
                    'description': outline.get('description', ''),
                    'html_url': outline.get('htmlUrl', '')
                }
                podcasts.append(podcast)

        logger.info(f"Parsed OPML file and found {len(podcasts)} podcasts")
        return podcasts
    except Exception as e:
        logger.error(f"Error parsing OPML file: {str(e)}")
        return []
def generate_opml(podcasts):
    """
    Generate OPML content from a list of podcasts.

    Args:
        podcasts (list): List of Podcast model instances (objects exposing
            ``title``, ``feed_url`` and ``description`` attributes).

    Returns:
        str: Pretty-printed OPML 2.0 document, or an empty string on error.
    """
    from datetime import timezone

    try:
        # Create the root element
        root = ET.Element('opml')
        root.set('version', '2.0')

        # Head: document title plus RFC-822 style creation date.
        head = ET.SubElement(root, 'head')
        title = ET.SubElement(head, 'title')
        title.text = 'Podcastrr Subscriptions'
        date_created = ET.SubElement(head, 'dateCreated')
        # Use an aware UTC datetime; datetime.utcnow() is deprecated and
        # returns a naive value. The rendered string is identical.
        date_created.text = datetime.now(timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT')

        # Body: one outline element per subscribed podcast.
        body = ET.SubElement(root, 'body')
        for podcast in podcasts:
            outline = ET.SubElement(body, 'outline')
            outline.set('type', 'rss')
            outline.set('text', podcast.title)
            outline.set('title', podcast.title)
            outline.set('xmlUrl', podcast.feed_url)
            if podcast.description:
                outline.set('description', podcast.description)

        # Round-trip through minidom purely for indentation.
        xml_str = ET.tostring(root, encoding='utf-8')
        parsed_xml = minidom.parseString(xml_str)
        pretty_xml = parsed_xml.toprettyxml(indent="  ")

        logger.info(f"Generated OPML file with {len(podcasts)} podcasts")
        return pretty_xml
    except Exception as e:
        logger.error(f"Error generating OPML file: {str(e)}")
        return ""
def import_podcasts_from_opml(opml_content):
    """
    Import podcasts from OPML content into the database.

    Args:
        opml_content (str): OPML file content.

    Returns:
        dict: Import statistics with keys 'total', 'imported', 'skipped'
        and 'errors'.
    """
    from app.models.podcast import Podcast
    from app.models.database import db
    from app.services.podcast_updater import update_podcast

    podcasts = parse_opml(opml_content)

    stats = {
        'total': len(podcasts),
        'imported': 0,
        'skipped': 0,
        'errors': 0
    }

    for podcast_data in podcasts:
        try:
            # Skip feeds that are already subscribed (matched by feed URL).
            existing = Podcast.query.filter_by(feed_url=podcast_data['feed_url']).first()
            if existing:
                logger.info(f"Podcast already exists: {podcast_data['title']}")
                stats['skipped'] += 1
                continue

            # Create and persist the new podcast record.
            podcast = Podcast(
                title=podcast_data['title'],
                description=podcast_data.get('description', ''),
                feed_url=podcast_data['feed_url']
            )
            db.session.add(podcast)
            db.session.commit()

            # Fetch episodes for the new podcast. A failure here should not
            # abort the import; the podcast still counts as imported.
            try:
                update_podcast(podcast.id)
            except Exception as e:
                logger.error(f"Error updating podcast {podcast.title}: {str(e)}")

            stats['imported'] += 1
            logger.info(f"Imported podcast: {podcast.title}")
        except Exception as e:
            # Roll back so a failed commit does not leave the session in an
            # invalid state and poison the remaining iterations.
            db.session.rollback()
            stats['errors'] += 1
            logger.error(f"Error importing podcast: {str(e)}")

    return stats

View file

@ -173,6 +173,8 @@ def format_filename(format_string, podcast, episode):
# If episode_number exists but is not a digit, format as S01E{episode_number}
else f"S{episode.season or 1:02d}E{episode.episode_number}"
if episode.episode_number
# If neither season nor episode_number are available, use published date
else episode.published_date.strftime('%Y-%m-%d') if episode.published_date
# Otherwise, return empty string
else ''
),
@ -195,10 +197,23 @@ def format_filename(format_string, podcast, episode):
# Handle empty path segments by removing them
path_parts = formatted_path.split(os.path.sep)
path_parts = [part for part in path_parts if part.strip()]
# Remove empty segments and segments that would be just placeholders without values
cleaned_parts = []
for part in path_parts:
part = part.strip()
if not part:
continue
# Check for common placeholders without values
if part in ["Season ", "Season", "Episode ", "Episode", "E", "S"]:
continue
# Check for patterns like "S01E" without an episode number
if part.startswith("S") and part.endswith("E") and len(part) > 2:
continue
cleaned_parts.append(part)
# Rejoin the path with proper separators
return os.path.sep.join(path_parts)
return os.path.sep.join(cleaned_parts)
def sanitize_filename(filename):
"""
@ -277,6 +292,7 @@ def delete_old_episodes(days=30):
def verify_downloaded_episodes(podcast_id=None, progress_callback=None):
"""
Verify that downloaded episodes still exist on disk and update their status.
Also checks for existing files for episodes that aren't marked as downloaded.
Args:
podcast_id (int, optional): ID of the podcast to check. If None, check all podcasts.
@ -286,23 +302,24 @@ def verify_downloaded_episodes(podcast_id=None, progress_callback=None):
dict: Statistics about the verification process.
"""
from app.models.podcast import Episode, Podcast
from app.models.settings import Settings
# Get episodes to check
# First, verify episodes that are marked as downloaded
query = Episode.query.filter(Episode.downloaded == True)
if podcast_id:
query = query.filter(Episode.podcast_id == podcast_id)
episodes = query.all()
total = len(episodes)
downloaded_episodes = query.all()
total_downloaded = len(downloaded_episodes)
if progress_callback:
progress_callback(0, f"Verifying {total} downloaded episodes")
progress_callback(0, f"Verifying {total_downloaded} downloaded episodes")
missing = 0
for i, episode in enumerate(episodes):
if progress_callback and total > 0:
progress = int((i / total) * 100)
progress_callback(progress, f"Verifying episode {i+1}/{total}")
for i, episode in enumerate(downloaded_episodes):
if progress_callback and total_downloaded > 0:
progress = int((i / total_downloaded) * 50) # Use first half of progress for verification
progress_callback(progress, f"Verifying episode {i+1}/{total_downloaded}")
if not episode.file_path or not os.path.exists(episode.file_path):
episode.downloaded = False
@ -312,15 +329,133 @@ def verify_downloaded_episodes(podcast_id=None, progress_callback=None):
db.session.commit()
if progress_callback:
progress_callback(100, f"Verification complete. {missing} episodes marked as not downloaded.")
# Now check for existing files for episodes that aren't marked as downloaded
query = Episode.query.filter(Episode.downloaded == False)
if podcast_id:
query = query.filter(Episode.podcast_id == podcast_id)
logger.info(f"Verified {total} episodes. {missing} were missing.")
undownloaded_episodes = query.all()
total_undownloaded = len(undownloaded_episodes)
if progress_callback:
progress_callback(50, f"Checking for existing files for {total_undownloaded} undownloaded episodes")
found = 0
if total_undownloaded > 0 and podcast_id:
# Get the podcast
podcast = Podcast.query.get(podcast_id)
if not podcast:
logger.error(f"Podcast with ID {podcast_id} not found")
return {
'total_checked': total_downloaded,
'missing': missing,
'found': 0
}
# Get settings
settings = Settings.query.first()
if not settings:
settings = Settings(
download_path=current_app.config['DOWNLOAD_PATH'],
naming_format="{podcast_title}/{episode_title}"
)
db.session.add(settings)
db.session.commit()
# Use podcast's naming format if available, otherwise use global settings
naming_format = podcast.naming_format or settings.naming_format
download_path = settings.download_path
# Check each undownloaded episode for existing files
for i, episode in enumerate(undownloaded_episodes):
if progress_callback:
progress = 50 + int((i / total_undownloaded) * 50) # Use second half of progress for file matching
progress_callback(progress, f"Checking for file for episode {i+1}/{total_undownloaded}")
try:
# Format filename using the naming format
filename = format_filename(naming_format, podcast, episode)
# Check for common audio file extensions
extensions = ['.mp3', '.m4a', '.ogg', '.wav']
for ext in extensions:
file_path = os.path.normpath(os.path.join(download_path, filename + ext))
if os.path.exists(file_path):
logger.info(f"Found existing file for episode: {file_path}")
episode.downloaded = True
episode.file_path = file_path
found += 1
break
except Exception as e:
logger.error(f"Error checking for existing file for episode {episode.title}: {str(e)}")
db.session.commit()
if progress_callback:
progress_callback(100, f"Verification complete. {missing} episodes marked as not downloaded, {found} files matched.")
logger.info(f"Verified {total_downloaded} episodes. {missing} were missing. Found files for {found} undownloaded episodes.")
return {
'total_checked': total,
'missing': missing
'total_checked': total_downloaded,
'missing': missing,
'found': found
}
def download_all_episodes(podcast_id, progress_callback=None):
    """
    Download all episodes of a podcast that haven't been downloaded yet.

    Args:
        podcast_id: ID of the Podcast to download all episodes for.
        progress_callback (callable, optional): Callback function for progress updates.

    Returns:
        dict: Statistics about the download process with keys 'total',
        'downloaded' and 'failed'.

    Raises:
        ValueError: If no podcast with the given ID exists.
    """
    from app.models.podcast import Podcast, Episode

    if progress_callback:
        progress_callback(2, "Loading podcast data")

    podcast = Podcast.query.get(podcast_id)
    if not podcast:
        raise ValueError(f"Podcast with ID {podcast_id} not found")

    # Queue only the episodes that are not yet on disk.
    pending = Episode.query.filter_by(podcast_id=podcast_id, downloaded=False).all()
    total_episodes = len(pending)

    if progress_callback:
        progress_callback(5, f"Found {total_episodes} episodes to download")

    if total_episodes == 0:
        if progress_callback:
            progress_callback(100, "No episodes to download")
        return {"total": 0, "downloaded": 0, "failed": 0}

    stats = {"total": total_episodes, "downloaded": 0, "failed": 0}

    for index, episode in enumerate(pending):
        if progress_callback:
            # Scale from 5% to 95% across the episode list.
            progress = 5 + int((index / total_episodes) * 90)
            progress_callback(progress, f"Downloading episode {index+1}/{total_episodes}: {episode.title}")

        # A single failed download is logged and counted; the loop continues.
        try:
            download_episode(episode.id)
            stats["downloaded"] += 1
            logger.info(f"Downloaded episode {index+1}/{total_episodes}: {episode.title}")
        except Exception as e:
            stats["failed"] += 1
            logger.error(f"Error downloading episode {episode.title}: {str(e)}")

    if progress_callback:
        progress_callback(100, f"Download complete. Downloaded {stats['downloaded']} episodes, {stats['failed']} failed.")

    logger.info(f"Podcast archive download completed: {stats}")
    return stats
def rename_episode(episode_id, new_format=None, progress_callback=None):
"""
Rename a downloaded episode file using a new format.

View file

@ -142,15 +142,126 @@ def get_podcast_episodes(feed_url):
'published_date': _parse_date(entry.get('published')),
'guid': entry.get('id', ''),
'duration': _parse_duration(entry.get('itunes_duration', '')),
'season': entry.get('itunes_season'), # Season number
'episode_number': entry.get('itunes_episode', ''), # Episode number within season
'season': None, # Default to None
'episode_number': None, # Default to None, will try to extract from various sources
'explicit': False # Default to False
}
# Handle explicit flag safely
itunes_explicit = entry.get('itunes_explicit', '')
if isinstance(itunes_explicit, str) and itunes_explicit:
episode['explicit'] = itunes_explicit.lower() == 'yes'
# Handle season tag - try multiple ways to access it
try:
# Try as attribute first
if hasattr(entry, 'itunes_season'):
episode['season'] = int(entry.itunes_season) if entry.itunes_season else None
logger.debug(f"Found season as attribute: {episode['season']}")
# Try as dictionary key
elif entry.get('itunes_season'):
episode['season'] = int(entry.get('itunes_season')) if entry.get('itunes_season') else None
logger.debug(f"Found season as dict key: {episode['season']}")
# Try looking in tags
elif hasattr(entry, 'tags'):
for tag in entry.tags:
if tag.get('term', '').startswith('Season'):
try:
episode['season'] = int(tag.get('term').replace('Season', '').strip())
logger.debug(f"Found season in tags: {episode['season']}")
break
except (ValueError, TypeError):
pass
except Exception as e:
logger.warning(f"Error parsing season: {str(e)}")
# Handle episode number - try multiple ways to access it
try:
# Try as attribute first (itunes_episode)
if hasattr(entry, 'itunes_episode') and entry.itunes_episode:
episode['episode_number'] = entry.itunes_episode
logger.debug(f"Found episode number as attribute: {episode['episode_number']}")
# Try as dictionary key
elif entry.get('itunes_episode'):
episode['episode_number'] = entry.get('itunes_episode')
logger.debug(f"Found episode number as dict key: {episode['episode_number']}")
# Try to extract from title if it contains "Episode X" or "Ep X" or "#X"
elif episode['title']:
import re
# Common patterns for episode numbers in titles
patterns = [
r'Episode\s+(\d+)', # "Episode 123"
r'Ep\s*(\d+)', # "Ep123" or "Ep 123"
r'#(\d+)', # "#123"
r'E(\d+)', # "E123" or "S1E123"
]
for pattern in patterns:
match = re.search(pattern, episode['title'], re.IGNORECASE)
if match:
episode['episode_number'] = match.group(1)
logger.debug(f"Extracted episode number from title: {episode['episode_number']}")
break
except Exception as e:
logger.warning(f"Error parsing episode number: {str(e)}")
# Handle explicit flag - try multiple ways to access it
try:
# Try as attribute first
if hasattr(entry, 'itunes_explicit'):
explicit_value = entry.itunes_explicit
if isinstance(explicit_value, str):
episode['explicit'] = explicit_value.lower() in ('yes', 'true')
logger.debug(f"Found explicit as attribute: {episode['explicit']}")
# Try as dictionary key
elif entry.get('itunes_explicit'):
explicit_value = entry.get('itunes_explicit')
if isinstance(explicit_value, str):
episode['explicit'] = explicit_value.lower() in ('yes', 'true')
logger.debug(f"Found explicit as dict key: {episode['explicit']}")
except Exception as e:
logger.warning(f"Error parsing explicit flag: {str(e)}")
# Handle the different combinations of season and episode numbers
# Case 1: No season, no episode - use published date to create a sequential order
if episode['season'] is None and (episode['episode_number'] is None or episode['episode_number'] == ''):
if episode['published_date']:
# Use the publication date to create a pseudo-episode number
# Format: YYYYMMDD (e.g., 20230101 for January 1, 2023)
episode['episode_number'] = episode['published_date'].strftime('%Y%m%d')
logger.debug(f"No season or episode number, using date as episode number: {episode['episode_number']}")
else:
# If no publication date, use a placeholder
episode['episode_number'] = "unknown"
logger.debug("No season, episode number, or date available")
# Case 2: No season, but episode number exists - keep episode number as is
elif episode['season'] is None and episode['episode_number'] is not None:
logger.debug(f"Using episode number without season: {episode['episode_number']}")
# Case 3: Season exists, no episode number - use season as prefix for ordering
elif episode['season'] is not None and (episode['episode_number'] is None or episode['episode_number'] == ''):
if episode['published_date']:
# Use the publication date with season prefix
# Format: S01_YYYYMMDD
episode['episode_number'] = f"S{episode['season']:02d}_{episode['published_date'].strftime('%Y%m%d')}"
logger.debug(f"Season without episode number, using season+date: {episode['episode_number']}")
else:
# If no publication date, use season with unknown suffix
episode['episode_number'] = f"S{episode['season']:02d}_unknown"
logger.debug(f"Season without episode number or date: {episode['episode_number']}")
# Case 4: Both season and episode exist - format as S01E02
elif episode['season'] is not None and episode['episode_number'] is not None:
# Check if episode_number is already formatted as S01E02
import re
if not re.match(r'^S\d+E\d+$', str(episode['episode_number']), re.IGNORECASE):
try:
# Try to convert episode_number to integer for proper formatting
ep_num = int(episode['episode_number'])
episode['episode_number'] = f"S{episode['season']:02d}E{ep_num:02d}"
logger.debug(f"Formatted season and episode as: {episode['episode_number']}")
except (ValueError, TypeError):
# If episode_number can't be converted to int, use as is with season prefix
episode['episode_number'] = f"S{episode['season']:02d}_{episode['episode_number']}"
logger.debug(f"Using season prefix with non-numeric episode: {episode['episode_number']}")
else:
logger.debug(f"Episode already formatted correctly: {episode['episode_number']}")
# Generate a GUID if one is not provided
if not episode['guid']:

View file

@ -128,20 +128,60 @@ def update_podcast(podcast_id, progress_callback=None):
published_date=episode_data.get('published_date'),
duration=episode_data.get('duration'),
file_size=episode_data.get('file_size'),
season=episode_data.get('season'), # Season number
episode_number=episode_data.get('episode_number'),
guid=episode_data['guid'],
downloaded=False
downloaded=False,
explicit=episode_data.get('explicit') # Explicit flag
)
db.session.add(episode)
stats['new_episodes'] += 1
logger.info(f"Added new episode: {episode.title}")
# Auto-download if enabled
if podcast.auto_download and episode.audio_url:
try:
# Need to commit first to ensure episode has an ID
# Need to commit first to ensure episode has an ID
db.session.commit()
# Check if file already exists for this episode
try:
from app.services.podcast_downloader import format_filename
import os
from app.models.settings import Settings
settings = Settings.query.first()
if not settings:
settings = Settings(
download_path=current_app.config['DOWNLOAD_PATH'],
naming_format="{podcast_title}/{episode_title}"
)
db.session.add(settings)
db.session.commit()
# Use podcast's naming format if available, otherwise use global settings
naming_format = podcast.naming_format or settings.naming_format
# Format filename using the naming format
filename = format_filename(naming_format, podcast, episode)
download_path = settings.download_path
# Check for common audio file extensions
extensions = ['.mp3', '.m4a', '.ogg', '.wav']
for ext in extensions:
file_path = os.path.normpath(os.path.join(download_path, filename + ext))
if os.path.exists(file_path):
logger.info(f"Found existing file for episode: {file_path}")
episode.downloaded = True
episode.file_path = file_path
db.session.commit()
break
logger.info(f"Checked for existing files for episode: {episode.title}")
except Exception as e:
logger.error(f"Error checking for existing files for episode {episode.title}: {str(e)}")
# Auto-download if enabled and not already downloaded
if podcast.auto_download and episode.audio_url and not episode.downloaded:
try:
download_episode(episode.id)
stats['episodes_downloaded'] += 1
logger.info(f"Auto-downloaded episode: {episode.title}")

View file

@ -172,12 +172,12 @@ class TaskManager:
with self.lock:
return list(self.tasks.values())
def clean_old_tasks(self, max_age_seconds=60):
def clean_old_tasks(self, max_age_seconds=86400):
"""
Remove old completed or failed tasks.
Args:
max_age_seconds (int): Maximum age of tasks to keep in seconds
max_age_seconds (int): Maximum age of tasks to keep in seconds (default: 24 hours)
Returns:
int: Number of tasks removed