// Source: moon / src / podcast_xml.c
// Repository page header captured with this file: "Login"; 1 branch, 0 tags;
// latest commit 7080bfb "Minor Makefile cleanup" by Ben (T14/NixOS),
// 28 days ago; 85 commits.
#include "podcast_xml.h"
#include <stdio.h>
#include <string.h>
#include "storage.h"
#include "yxml.h"

#define XML_BUF_SIZE 1024
#define XML_STACK_SIZE 2048

// Strip leading and trailing blanks (space, LF, CR, tab) from s, in place.
static void str_trim(char* s) {
	static const char blanks[] = " \n\r\t";

	// Shift the text left over any leading blanks (terminator included).
	size_t lead = strspn(s, blanks);
	if (lead > 0) {
		memmove(s, s + lead, strlen(s + lead) + 1);
	}

	// Walk back from the terminator, overwriting trailing blanks with NUL.
	for (char* tail = s + strlen(s);
	     tail > s && strchr(blanks, tail[-1]) != NULL; tail--) {
		tail[-1] = '\0';
	}
}

// Append one character to a NUL-terminated buffer, keeping it terminated.
//
//   buf       destination buffer
//   buf_size  total capacity of buf in bytes, terminator included
//   len       in/out: current string length; advanced by one on success
//   c         character to append
//
// The character is silently dropped when the buffer is already full.
// A zero-sized buffer is left untouched: without that guard `buf_size - 1`
// wraps around to SIZE_MAX and the bounds check would always pass.
static void buf_append(char* buf, size_t buf_size, size_t* len, char c) {
	if (buf_size == 0 || *len >= buf_size - 1) {
		return;
	}
	buf[(*len)++] = c;
	buf[*len] = '\0';
}

// Append the NUL-terminated yxml data string `data` to buf, one character
// at a time through buf_append (which enforces the capacity limit).
static void buf_append_data(char* buf,
                            size_t buf_size,
                            size_t* len,
                            const char* data) {
	size_t i = 0;
	while (data[i] != '\0') {
		buf_append(buf, buf_size, len, data[i]);
		i++;
	}
}

// Parse an OPML file and collect the feeds it lists.
//
// The file at `path` is read in XML_BUF_SIZE chunks and handed to yxml one
// byte at a time. Every <outline> that carries a non-empty xmlUrl attribute
// and whose type attribute is either absent or "rss" is stored in `feeds`,
// up to `max_feeds` entries. The feed title comes from the outline's `text`
// attribute, falling back to `title` when `text` is absent or empty.
//
// Returns the number of feeds stored, or -1 if the file could not be opened.
// A yxml parse error stops parsing; feeds collected before it are kept.
int podcast_parse_opml(const char* path, podcast_feed_t* feeds, int max_feeds) {
	storage_file_t f = storage_open(path, "r");
	if (!f) {
		return -1;
	}

	unsigned char stack[XML_STACK_SIZE];
	yxml_t x;
	yxml_init(&x, stack, sizeof(stack));

	char read_buf[XML_BUF_SIZE];
	int count = 0;

	// Name of the attribute whose value is currently being delivered
	char current_attr[32] = {0};
	// `text` and `title` are kept apart: outlines commonly carry both, and
	// appending both to one buffer would yield a doubled name ("FooFoo").
	char attr_text[128] = {0};
	size_t attr_text_len = 0;
	char attr_title[128] = {0};
	size_t attr_title_len = 0;
	char attr_url[256] = {0};
	size_t attr_url_len = 0;
	char attr_type[16] = {0};
	size_t attr_type_len = 0;
	bool in_outline = false;

	size_t n;
	while ((n = storage_read(f, read_buf, sizeof(read_buf))) > 0) {
		for (size_t i = 0; i < n; i++) {
			yxml_ret_t r = yxml_parse(&x, read_buf[i]);
			if (r < 0) {
				goto done;
			}

			switch (r) {
				case YXML_ELEMSTART:
					if (strcmp(x.elem, "outline") == 0) {
						// Fresh outline: forget the previous one's attributes
						in_outline = true;
						attr_text[0] = '\0';
						attr_text_len = 0;
						attr_title[0] = '\0';
						attr_title_len = 0;
						attr_url[0] = '\0';
						attr_url_len = 0;
						attr_type[0] = '\0';
						attr_type_len = 0;
					}
					break;

				case YXML_ATTRSTART:
					strncpy(current_attr, x.attr, sizeof(current_attr) - 1);
					current_attr[sizeof(current_attr) - 1] = '\0';
					break;

				case YXML_ATTRVAL:
					// Attribute values arrive in pieces; accumulate them
					if (in_outline) {
						if (strcmp(current_attr, "text") == 0) {
							buf_append_data(attr_text, sizeof(attr_text),
							                &attr_text_len, x.data);
						} else if (strcmp(current_attr, "title") == 0) {
							buf_append_data(attr_title, sizeof(attr_title),
							                &attr_title_len, x.data);
						} else if (strcmp(current_attr, "xmlUrl") == 0) {
							buf_append_data(attr_url, sizeof(attr_url),
							                &attr_url_len, x.data);
						} else if (strcmp(current_attr, "type") == 0) {
							buf_append_data(attr_type, sizeof(attr_type),
							                &attr_type_len, x.data);
						}
					}
					break;

				case YXML_ATTREND:
					current_attr[0] = '\0';
					break;

				case YXML_ELEMEND:
					if (in_outline && attr_url[0] != '\0' &&
					    count < max_feeds) {
						// Accept rss type or any outline with xmlUrl
						if (attr_type[0] == '\0' ||
						    strcmp(attr_type, "rss") == 0) {
							const char* label =
							    attr_text[0] != '\0' ? attr_text : attr_title;
							strncpy(feeds[count].title, label,
							        sizeof(feeds[count].title) - 1);
							feeds[count].title[sizeof(feeds[count].title) - 1] =
							    '\0';
							strncpy(feeds[count].url, attr_url,
							        sizeof(feeds[count].url) - 1);
							feeds[count].url[sizeof(feeds[count].url) - 1] =
							    '\0';
							count++;
							// Output is full: nothing further can be stored,
							// so skip reading the rest of the file.
							if (count >= max_feeds) {
								goto done;
							}
						}
					}
					in_outline = false;
					break;

				default:
					break;
			}
		}
	}

done:
	storage_close(f);
	return count;
}

// Parse an RSS feed file and collect its audio episodes.
//
// The file at `path` is read in XML_BUF_SIZE chunks and handed to yxml one
// byte at a time. For each <item>, the text of <title>, <guid> and <pubDate>
// and the url/type attributes of <enclosure> are gathered. An item is stored
// in `episodes` when it has a non-empty enclosure url whose type starts with
// "audio/", up to `max_episodes` entries. When an item has no guid, its
// title is used as the guid.
//
// Collected fields are whitespace-trimmed before being stored, matching what
// podcast_parse_entry does, so a guid read from a feed compares equal to the
// same guid read back from a stored entry.
//
// Returns the number of episodes stored, or -1 if the file could not be
// opened. A yxml parse error stops parsing; episodes collected before it are
// kept.
int podcast_parse_rss(const char* path,
                      podcast_episode_t* episodes,
                      int max_episodes) {
	storage_file_t f = storage_open(path, "r");
	if (!f) {
		return -1;
	}

	unsigned char stack[XML_STACK_SIZE];
	yxml_t x;
	yxml_init(&x, stack, sizeof(stack));

	char read_buf[XML_BUF_SIZE];
	int count = 0;

	// Parser state
	bool in_item = false;
	// True while reading an <enclosure> that must be ignored because an
	// audio enclosure was already captured for the current item
	bool keep_enclosure = false;
	int depth = 0;
	int item_depth = 0;

	char current_elem[64] = {0};
	char current_attr[32] = {0};
	char title[128] = {0};
	size_t title_len = 0;
	char guid[128] = {0};
	size_t guid_len = 0;
	char enclosure_url[256] = {0};
	size_t enclosure_url_len = 0;
	char enclosure_type[64] = {0};
	size_t enclosure_type_len = 0;
	char pub_date[64] = {0};
	size_t pub_date_len = 0;

	size_t n;
	while ((n = storage_read(f, read_buf, sizeof(read_buf))) > 0) {
		for (size_t i = 0; i < n; i++) {
			yxml_ret_t r = yxml_parse(&x, read_buf[i]);
			if (r < 0) {
				goto rss_done;
			}

			switch (r) {
				case YXML_ELEMSTART:
					depth++;
					strncpy(current_elem, x.elem, sizeof(current_elem) - 1);
					current_elem[sizeof(current_elem) - 1] = '\0';

					if (!in_item) {
						if (strcmp(x.elem, "item") == 0) {
							// New item: remember its depth and clear fields
							in_item = true;
							keep_enclosure = false;
							item_depth = depth;
							title[0] = '\0';
							title_len = 0;
							guid[0] = '\0';
							guid_len = 0;
							enclosure_url[0] = '\0';
							enclosure_url_len = 0;
							enclosure_type[0] = '\0';
							enclosure_type_len = 0;
							pub_date[0] = '\0';
							pub_date_len = 0;
						}
					} else if (strcmp(x.elem, "enclosure") == 0) {
						// Items may carry several enclosures. Keep the first
						// audio one; otherwise start over so the attribute
						// values of different enclosures never concatenate.
						keep_enclosure =
						    enclosure_url[0] != '\0' &&
						    strncmp(enclosure_type, "audio/", 6) == 0;
						if (!keep_enclosure) {
							enclosure_url[0] = '\0';
							enclosure_url_len = 0;
							enclosure_type[0] = '\0';
							enclosure_type_len = 0;
						}
					}
					break;

				case YXML_CONTENT:
					if (in_item) {
						if (strcmp(current_elem, "title") == 0) {
							buf_append_data(title, sizeof(title), &title_len,
							                x.data);
						} else if (strcmp(current_elem, "guid") == 0) {
							buf_append_data(guid, sizeof(guid), &guid_len,
							                x.data);
						} else if (strcmp(current_elem, "pubDate") == 0) {
							buf_append_data(pub_date, sizeof(pub_date),
							                &pub_date_len, x.data);
						}
					}
					break;

				case YXML_ATTRSTART:
					strncpy(current_attr, x.attr, sizeof(current_attr) - 1);
					current_attr[sizeof(current_attr) - 1] = '\0';
					break;

				case YXML_ATTRVAL:
					if (in_item && !keep_enclosure &&
					    strcmp(current_elem, "enclosure") == 0) {
						if (strcmp(current_attr, "url") == 0) {
							buf_append_data(enclosure_url,
							                sizeof(enclosure_url),
							                &enclosure_url_len, x.data);
						} else if (strcmp(current_attr, "type") == 0) {
							buf_append_data(enclosure_type,
							                sizeof(enclosure_type),
							                &enclosure_type_len, x.data);
						}
					}
					break;

				case YXML_ATTREND:
					current_attr[0] = '\0';
					break;

				case YXML_ELEMEND:
					// Text after a closing tag belongs to no tracked element
					current_elem[0] = '\0';
					if (in_item && depth == item_depth) {
						// End of <item>
						str_trim(title);
						str_trim(guid);
						str_trim(pub_date);
						str_trim(enclosure_url);

						if (enclosure_url[0] != '\0' &&
						    strncmp(enclosure_type, "audio/", 6) == 0 &&
						    count < max_episodes) {
							podcast_episode_t* ep = &episodes[count];
							strncpy(ep->title, title, sizeof(ep->title) - 1);
							ep->title[sizeof(ep->title) - 1] = '\0';
							strncpy(ep->guid, guid[0] ? guid : title,
							        sizeof(ep->guid) - 1);
							ep->guid[sizeof(ep->guid) - 1] = '\0';
							strncpy(ep->enclosure_url, enclosure_url,
							        sizeof(ep->enclosure_url) - 1);
							ep->enclosure_url[sizeof(ep->enclosure_url) - 1] =
							    '\0';
							strncpy(ep->pub_date, pub_date,
							        sizeof(ep->pub_date) - 1);
							ep->pub_date[sizeof(ep->pub_date) - 1] = '\0';
							count++;
							// Output is full: nothing further can be stored,
							// so skip reading the rest of the feed.
							if (count >= max_episodes) {
								goto rss_done;
							}
						}
						in_item = false;
					}
					depth--;
					break;

				default:
					break;
			}
		}
	}

rss_done:
	storage_close(f);
	return count;
}

// Reduce a title to a restricted character set.
//
//   title     NUL-terminated input string
//   out       destination buffer
//   out_size  capacity of `out` in bytes, terminator included
//
// ASCII letters, digits, '.', '_' and '-' are copied as-is. Any other
// character becomes '_', except that such replacements are skipped at the
// start of the output and directly after a '_'. Trailing '_' are removed.
// If nothing remains, the result is "_". Output is truncated to fit `out`.
//
// Nothing is written when out_size is 0. When out_size is 1 there is room
// for the terminator only, so the result is the empty string; the "_"
// fallback would otherwise write one byte past the end of the buffer.
void podcast_sanitize_title(const char* title, char* out, size_t out_size) {
	if (out_size == 0) {
		return;
	}
	if (out_size == 1) {
		out[0] = '\0';
		return;
	}
	size_t max_len = out_size - 1;
	size_t len = 0;
	for (const char* p = title; *p && len < max_len; p++) {
		char c = *p;
		if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
		    (c >= '0' && c <= '9') || c == '.' || c == '_' || c == '-') {
			out[len++] = c;
		} else if (len > 0 && out[len - 1] != '_') {
			// Collapse a run of disallowed characters into a single '_'
			out[len++] = '_';
		}
	}
	while (len > 0 && out[len - 1] == '_') {
		len--;
	}
	if (len == 0) {
		// max_len >= 1 here, so out[1] is within the buffer
		out[0] = '_';
		len = 1;
	}
	out[len] = '\0';
}

// Write the NUL-terminated string s to f with XML escaping applied:
// '&' -> "&amp;", '<' -> "&lt;", '>' -> "&gt;", '"' -> "&quot;".
// All other bytes are written unchanged.
//
// Runs of characters that need no escaping are written with a single
// storage_write call instead of one call per byte.
static void write_escaped(storage_file_t f, const char* s) {
	const char* p = s;
	while (*p) {
		// Length of the leading run that contains no special character
		size_t run = strcspn(p, "&<>\"");
		if (run > 0) {
			storage_write(f, p, run);
			p += run;
		}
		// *p is now either a special character or the terminator
		switch (*p) {
			case '&':
				storage_write(f, "&amp;", 5);
				p++;
				break;
			case '<':
				storage_write(f, "&lt;", 4);
				p++;
				break;
			case '>':
				storage_write(f, "&gt;", 4);
				p++;
				break;
			case '"':
				storage_write(f, "&quot;", 6);
				p++;
				break;
			default:
				// Terminator reached; the loop condition ends the walk
				break;
		}
	}
}

// Write the NUL-terminated string s to f (terminator not included).
static void write_str(storage_file_t f, const char* s) {
	size_t nbytes = strlen(s);
	storage_write(f, s, nbytes);
}

// Write one episode to out_path as a single <item> element.
//
// Each of title, guid, pubDate and the enclosure url is emitted only when
// the corresponding field of `ep` is non-empty; values are XML-escaped.
// The enclosure type attribute is always written as "audio/mpeg".
//
// Returns false if the file could not be opened for writing, true otherwise.
bool podcast_write_entry(const char* out_path, const podcast_episode_t* ep) {
	storage_file_t out = storage_open(out_path, "w");
	if (!out) {
		return false;
	}

	// Markup before the value, the value itself, markup after the value;
	// listed in the order the parts appear in the file.
	const struct {
		const char* before;
		const char* value;
		const char* after;
	} parts[] = {
	    {"<title>", ep->title, "</title>\n"},
	    {"<guid>", ep->guid, "</guid>\n"},
	    {"<pubDate>", ep->pub_date, "</pubDate>\n"},
	    {"<enclosure url=\"", ep->enclosure_url,
	     "\" type=\"audio/mpeg\"/>\n"},
	};

	write_str(out, "<item>\n");
	for (size_t k = 0; k < sizeof(parts) / sizeof(parts[0]); k++) {
		if (parts[k].value[0] == '\0') {
			continue;  // empty field: leave the element out entirely
		}
		write_str(out, parts[k].before);
		write_escaped(out, parts[k].value);
		write_str(out, parts[k].after);
	}
	write_str(out, "</item>\n");

	storage_close(out);
	return true;
}

// Read back a single episode entry (an <item> element) from `path`.
//
// `ep` is zeroed once the file has been opened, then filled from the text
// of <title>, <guid> and <pubDate> and from the url attribute of
// <enclosure>, all taken from inside <item>. Each field is whitespace-
// trimmed afterwards. If no guid was found, the title is copied into guid.
//
// Returns false if the file could not be opened (`ep` is then left
// untouched). Otherwise returns true when the title or the enclosure url is
// non-empty. A yxml parse error stops parsing; whatever was collected before
// it is still trimmed and evaluated.
bool podcast_parse_entry(const char* path, podcast_episode_t* ep) {
	storage_file_t f = storage_open(path, "r");
	if (!f) {
		return false;
	}

	memset(ep, 0, sizeof(*ep));

	unsigned char stack[XML_STACK_SIZE];
	yxml_t x;
	yxml_init(&x, stack, sizeof(stack));

	char read_buf[XML_BUF_SIZE];
	char current_elem[64] = {0};
	char current_attr[32] = {0};
	size_t title_len = 0;
	size_t guid_len = 0;
	size_t url_len = 0;
	size_t date_len = 0;
	bool in_item = false;

	size_t n;
	while ((n = storage_read(f, read_buf, sizeof(read_buf))) > 0) {
		for (size_t i = 0; i < n; i++) {
			yxml_ret_t r = yxml_parse(&x, read_buf[i]);
			if (r < 0) {
				goto entry_done;
			}

			switch (r) {
				case YXML_ELEMSTART:
					strncpy(current_elem, x.elem, sizeof(current_elem) - 1);
					current_elem[sizeof(current_elem) - 1] = '\0';
					if (strcmp(x.elem, "item") == 0) {
						in_item = true;
					}
					break;

				case YXML_CONTENT:
					if (!in_item) {
						break;
					}
					if (strcmp(current_elem, "title") == 0) {
						buf_append_data(ep->title, sizeof(ep->title),
						                &title_len, x.data);
					} else if (strcmp(current_elem, "guid") == 0) {
						buf_append_data(ep->guid, sizeof(ep->guid), &guid_len,
						                x.data);
					} else if (strcmp(current_elem, "pubDate") == 0) {
						buf_append_data(ep->pub_date, sizeof(ep->pub_date),
						                &date_len, x.data);
					}
					break;

				case YXML_ATTRSTART:
					strncpy(current_attr, x.attr, sizeof(current_attr) - 1);
					current_attr[sizeof(current_attr) - 1] = '\0';
					break;

				case YXML_ATTRVAL:
					// Only the enclosure url is stored; its type attribute
					// has no destination field and is skipped.
					if (in_item && strcmp(current_elem, "enclosure") == 0 &&
					    strcmp(current_attr, "url") == 0) {
						buf_append_data(ep->enclosure_url,
						                sizeof(ep->enclosure_url), &url_len,
						                x.data);
					}
					break;

				case YXML_ATTREND:
					current_attr[0] = '\0';
					break;

				case YXML_ELEMEND:
					// As in podcast_parse_rss: once an element closes, the
					// text that follows must not be appended to its field.
					current_elem[0] = '\0';
					break;

				default:
					break;
			}
		}
	}

entry_done:
	storage_close(f);

	str_trim(ep->title);
	str_trim(ep->guid);
	str_trim(ep->pub_date);
	str_trim(ep->enclosure_url);

	// Use title as guid fallback
	if (!ep->guid[0] && ep->title[0]) {
		snprintf(ep->guid, sizeof(ep->guid), "%s", ep->title);
	}

	return ep->title[0] || ep->enclosure_url[0];
}