logo

news_parse_ex

commit: 1f3340d3604fa37fa5f2812c7bfa87b5ae856ba7
parent 96f57d300aefb7dec1edc982e423634dc0b67fbf
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Sat,  4 Feb 2023 12:16:53 +0100

Split Atom and RSS2.0 in two modules

Diffstat:

A lib/atom.ex          |  61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M lib/news_parse_ex.ex | 110 +++++++------------------------------------------------------------------------
A lib/rss2_0.ex        |  66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 136 insertions(+), 101 deletions(-)

diff --git a/lib/atom.ex b/lib/atom.ex @@ -0,0 +1,61 @@ +# NewsParseEx: RSS/Atom parser +# Copyright © 2022-2023 Haelwenn (lanodan) Monnier <contact+news_parse_ex@hacktivis.me> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule NewsParseEx.Atom do + alias NewsParseEx.XML + + defp get_feed_title(doc) do + with {:ok, type} <- XML.string_from_xpath(~s[/feed/title/@type], doc) do + get_feed_title(doc, type) + end + end + + defp get_feed_title(doc, "html") do + title = :xmerl_xpath.string('/feed/title/child::node()', doc) + {:ok, {:html, title}} + end + + defp get_feed_title(doc, "xhtml") do + title = :xmerl_xpath.string('/feed/title/*[local-name(.)="div"]', doc) + {:ok, {:xhtml, title}} + end + + defp get_feed_title(doc, _) do + XML.string_from_xpath(~s[/feed/title/text()], doc) + end + + defp get_feed_id(doc), do: XML.string_from_xpath(~s[/feed/id/text()], doc) + + defp get_feed_last_update(doc) do + {:ok, updated} = XML.string_from_xpath(~s[/feed/updated/text()], doc) + DateTime.from_iso8601(updated) + end + + defp get_feed_description(_doc), do: {:ok, nil} + + defp get_feed_entries(_doc) do + # FIXME + {:ok, []} + end + + def parse(doc) do + with {_, {:ok, id}} <- {:id, get_feed_id(doc)}, + {_, {:ok, last_update, _tz_offset}} <- + {:last_update, get_feed_last_update(doc)}, + {_, {:ok, title}} <- {:title, get_feed_title(doc)}, + {_, {:ok, description}} <- {:desc, get_feed_description(doc)}, + {_, {:ok, entries}} <- {:entries, get_feed_entries(doc)} do + data = %{ + :parser => NewsParseEx.Atom, + :title => title, + :description => description, + :id => id, + :last_update => last_update, + :entries => entries + } + + {:ok, data} + end + end +end diff --git a/lib/news_parse_ex.ex b/lib/news_parse_ex.ex @@ -4,132 +4,40 @@ defmodule NewsParseEx do alias NewsParseEx.XML - alias NewsParseEx.Maps - defp get_feed_type(doc) do + defp get_feed_parser(doc) do with {:ok, root_name} <- XML.string_from_xpath(~s[name()], doc) do - get_feed_type(doc, root_name) + 
get_feed_parser(doc, root_name) end end - defp get_feed_type(doc, "feed") do + defp get_feed_parser(doc, "feed") do with {:ok, namespace} <- XML.string_from_xpath(~s{/feed/namespace::*[name()='']}, doc) do if namespace == "http://www.w3.org/2005/Atom" do - {:ok, :atom} + {:ok, NewsParseEx.Atom} else {:error, "Atom feed with wrong root namespace: #{namespace}"} end end end - defp get_feed_type(doc, "rss") do + defp get_feed_parser(doc, "rss") do with {:ok, version} <- XML.string_from_xpath(~s{/rss/@version}, doc) do case version do - "2.0" -> {:ok, :rss2_0} + "2.0" -> {:ok, NewsParseEx.RSS2_0} version -> {:error, "RSS with unknown version: #{version}"} end end end - defp get_feed_type(_doc, root_name) do + defp get_feed_parser(_doc, root_name) do {:error, "XML root isn't <feed> but <#{root_name}>"} end - defp get_feed_title(doc, :rss2_0), do: XML.string_from_xpath(~s[/rss/channel/title/text()], doc) - - defp get_feed_title(doc, :atom) do - with {:ok, type} <- XML.string_from_xpath(~s[/feed/title/@type], doc) do - get_feed_title(doc, :atom, type) - end - end - - defp get_feed_title(doc, :atom, "html") do - title = :xmerl_xpath.string('/feed/title/child::node()', doc) - {:ok, {:html, title}} - end - - defp get_feed_title(doc, :atom, "xhtml") do - title = :xmerl_xpath.string('/feed/title/*[local-name(.)="div"]', doc) - {:ok, {:xhtml, title}} - end - - defp get_feed_title(doc, :atom, _) do - XML.string_from_xpath(~s[/feed/title/text()], doc) - end - - defp get_feed_id(doc, :atom), do: XML.string_from_xpath(~s[/feed/id/text()], doc) - defp get_feed_id(doc, :rss2_0), do: XML.string_from_xpath(~s[/rss/channel/link/text()], doc) - - defp get_feed_last_update(doc, :atom) do - {:ok, updated} = XML.string_from_xpath(~s[/feed/updated/text()], doc) - DateTime.from_iso8601(updated) - end - - defp get_feed_last_update(_doc, :rss2_0), do: {:ok, nil, nil} - - defp get_feed_description(_doc, :atom), do: {:ok, nil} - - defp get_feed_description(doc, :rss2_0), - do: 
XML.string_from_xpath(~s[/rss/channel/description/text()], doc) - - defp get_feed_entries(_doc, :atom) do - # FIXME - {:ok, []} - end - - defp get_feed_entries(doc, :rss2_0) do - items = :xmerl_xpath.string('/rss/channel/item', doc) - - if length(items) != 0 do - entries = Enum.map(items, &get_feed_entry(&1, :rss2_0)) - {:ok, entries} - else - {:ok, []} - end - end - - defp get_entry_title(frag, :rss2_0), do: XML.string_from_xpath(~s{/item/title}, frag) - - defp get_entry_description(frag, :rss2_0), - do: XML.string_from_xpath(~s{/item/description}, frag) - - defp get_entry_link(frag, :rss2_0), do: XML.string_from_xpath(~s{/item/link}, frag) - defp get_entry_id(frag, :rss2_0), do: XML.string_from_xpath(~s{/item/guid}, frag) - - defp get_entry_published(frag, :rss2_0) do - with {:ok, pubDate} <- XML.string_from_xpath(~s{/item/pubDate}, frag) do - Calendar.DateTime.Parse.rfc822_utc(pubDate) - end - end - - defp get_feed_entry(frag, :rss2_0) do - %{} - |> Maps.put_if_ok(:title, get_entry_title(frag, :rss2_0)) - |> Maps.put_if_ok(:id, get_entry_id(frag, :rss2_0)) - |> Maps.put_if_ok(:link, get_entry_link(frag, :rss2_0)) - |> Maps.put_if_ok(:published, get_entry_published(frag, :rss2_0)) - |> Maps.put_if_ok(:description, get_entry_description(frag, :rss2_0)) - end - def parse(str) when is_bitstring(str) do with {_, {:ok, doc}} <- {:parse, XML.parse_document(str)}, - {_, {:ok, feed_type}} <- {:type, get_feed_type(doc)}, - {_, {:ok, id}} <- {:id, get_feed_id(doc, feed_type)}, - {_, {:ok, last_update, _tz_offset}} <- - {:last_update, get_feed_last_update(doc, feed_type)}, - {_, {:ok, title}} <- {:title, get_feed_title(doc, feed_type)}, - {_, {:ok, description}} <- {:desc, get_feed_description(doc, feed_type)}, - {_, {:ok, entries}} <- {:entries, get_feed_entries(doc, feed_type)} do - data = %{ - :type => feed_type, - :title => title, - :description => description, - :id => id, - :last_update => last_update, - :entries => entries - } - - {:ok, data} + {_, {:ok, parser}} <- 
{:parser, get_feed_parser(doc)} do + parser.parse(doc) end end end diff --git a/lib/rss2_0.ex b/lib/rss2_0.ex @@ -0,0 +1,66 @@ +# NewsParseEx: RSS/Atom parser +# Copyright © 2022-2023 Haelwenn (lanodan) Monnier <contact+news_parse_ex@hacktivis.me> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule NewsParseEx.RSS2_0 do + alias NewsParseEx.Maps + alias NewsParseEx.XML + + defp get_feed_title(doc), do: XML.string_from_xpath(~s[/rss/channel/title/text()], doc) + defp get_feed_id(doc), do: XML.string_from_xpath(~s[/rss/channel/link/text()], doc) + defp get_feed_last_update(_doc), do: {:ok, nil, nil} + + defp get_feed_description(doc), + do: XML.string_from_xpath(~s[/rss/channel/description/text()], doc) + + defp get_feed_entries(doc) do + items = :xmerl_xpath.string('/rss/channel/item', doc) + + if length(items) != 0 do + entries = Enum.map(items, &get_feed_entry(&1)) + {:ok, entries} + else + {:ok, []} + end + end + + defp get_entry_title(frag), do: XML.string_from_xpath(~s{/item/title}, frag) + defp get_entry_description(frag), do: XML.string_from_xpath(~s{/item/description}, frag) + defp get_entry_link(frag), do: XML.string_from_xpath(~s{/item/link}, frag) + defp get_entry_id(frag), do: XML.string_from_xpath(~s{/item/guid}, frag) + + defp get_entry_published(frag) do + with {:ok, pubDate} <- XML.string_from_xpath(~s{/item/pubDate}, frag) do + Calendar.DateTime.Parse.rfc822_utc(pubDate) + end + end + + defp get_feed_entry(frag) do + %{} + |> Maps.put_if_ok(:title, get_entry_title(frag)) + |> Maps.put_if_ok(:id, get_entry_id(frag)) + |> Maps.put_if_ok(:link, get_entry_link(frag)) + |> Maps.put_if_ok(:published, get_entry_published(frag)) + |> Maps.put_if_ok(:description, get_entry_description(frag)) + end + + def parse(doc) do + with {_, {:ok, id}} <- {:id, get_feed_id(doc)}, + {_, {:ok, last_update, _tz_offset}} <- + {:last_update, get_feed_last_update(doc)}, + {_, {:ok, title}} <- {:title, get_feed_title(doc)}, + {_, {:ok, description}} <- {:desc, 
get_feed_description(doc)}, + {_, {:ok, entries}} <- {:entries, get_feed_entries(doc)} do + data = %{ + :parser => NewsParseEx.RSS2_0, + :title => title, + :description => description, + :id => id, + :last_update => last_update, + :entries => entries + } + + {:ok, data} + end + end +end