logo

news_parse_ex

commit: 52f843eb5519e59c8a637bb82a7568d29c526f75
parent e84b5e9b5596175e9f2c0a09e1cc1ab8f323a87e
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Tue, 27 Dec 2022 14:49:50 +0100

Add initial support for Atom type attribute

Diffstat:

M lib/news_parse_ex.ex | 21 ++++++++++++++++++++-
M lib/xml.ex | 10 ++++++++++
M test/news_parse_ex_test.exs | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
3 files changed, 141 insertions(+), 38 deletions(-)

diff --git a/lib/news_parse_ex.ex b/lib/news_parse_ex.ex @@ -34,9 +34,28 @@ defmodule NewsParseEx do {:error, "XML root isn't <feed> but <#{root_name}>"} end - defp get_feed_title(doc, :atom), do: XML.string_from_xpath(~s[/feed/title/text()], doc) defp get_feed_title(doc, :rss2_0), do: XML.string_from_xpath(~s[/rss/channel/title/text()], doc) + defp get_feed_title(doc, :atom) do + with {:ok, type} <- XML.string_from_xpath(~s[/feed/title/@type], doc) do + get_feed_title(doc, :atom, type) + end + end + + defp get_feed_title(doc, :atom, "html") do + title = :xmerl_xpath.string('/feed/title/child::node()', doc) + {:ok, {:html, title}} + end + + defp get_feed_title(doc, :atom, "xhtml") do + title = :xmerl_xpath.string('/feed/title/*[local-name(.)="div"]', doc) + {:ok, {:xhtml, title}} + end + + defp get_feed_title(doc, :atom, _) do + XML.string_from_xpath(~s[/feed/title/text()], doc) + end + defp get_feed_id(doc, :atom), do: XML.string_from_xpath(~s[/feed/id/text()], doc) defp get_feed_id(doc, :rss2_0), do: XML.string_from_xpath(~s[/rss/channel/link/text()], doc) diff --git a/lib/xml.ex b/lib/xml.ex @@ -19,6 +19,16 @@ defmodule NewsParseEx.XML do {:ok, res} end + def string_from_doc(doc) do + :xmerl_xpath.string('//text()', doc) + |> Enum.map(fn x -> + {:xmlObj, :string, y} = :xmerl_xpath.string('string(.)', x) + y + end) + |> Enum.join() + |> String.trim() + end + def parse_document(text) do try do {doc, _rest} = diff --git a/test/news_parse_ex_test.exs b/test/news_parse_ex_test.exs @@ -1,59 +1,133 @@ defmodule NewsParseExTest do use ExUnit.Case + + alias NewsParseEx.XML + doctest NewsParseEx - test "parses basic Atom feed" do - feed = ~s[<?xml version="1.0" encoding="utf-8"?> + describe "Parses Atom Feed Document" do + test "basic Atom feed" do + feed = ~s[<?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> <title>Test Title</title> <id>https://example.org/feed/</id> <updated>2021-11-01T16:09:55Z</updated> </feed>] - {:ok, parsed} = 
NewsParseEx.parse(feed) - assert(parsed.title == "Test Title") - assert(parsed.description == nil) - assert(parsed.id == "https://example.org/feed/") - assert(parsed.last_update == ~U[2021-11-01 16:09:55Z]) - end + {:ok, parsed} = NewsParseEx.parse(feed) + assert(parsed.title == "Test Title") + assert(parsed.description == nil) + assert(parsed.id == "https://example.org/feed/") + assert(parsed.last_update == ~U[2021-11-01 16:09:55Z]) + end - test "parses Gitlab atom feed" do - feed = File.read!("test/fixtures/gitlab/wlroots-2022-12-27.atom") + test "Gitlab atom feed" do + feed = File.read!("test/fixtures/gitlab/wlroots-2022-12-27.atom") - {:ok, parsed} = NewsParseEx.parse(feed) - assert(parsed.title == "wlroots tags") - assert(parsed.description == nil) - assert(parsed.id == "https://gitlab.freedesktop.org/wlroots/wlroots/-/tags") - assert(parsed.last_update == ~U[2021-11-01T16:09:55Z]) - end + {:ok, parsed} = NewsParseEx.parse(feed) + assert(parsed.title == "wlroots tags") + assert(parsed.description == nil) + assert(parsed.id == "https://gitlab.freedesktop.org/wlroots/wlroots/-/tags") + assert(parsed.last_update == ~U[2021-11-01T16:09:55Z]) + end + + test "RFC4287 brief, single-entry Atom Feed Document" do + feed = File.read!("test/fixtures/rfc4287/brief_single_entry.atom") - test "parses RFC4287 brief, single-entry Atom Feed Document" do - feed = File.read!("test/fixtures/rfc4287/brief_single_entry.atom") + {:ok, parsed} = NewsParseEx.parse(feed) + assert(parsed.title == "Example Feed") + assert(parsed.description == nil) + assert(parsed.id == "urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6") + assert(parsed.last_update == ~U[2003-12-13T18:30:02Z]) + end - {:ok, parsed} = NewsParseEx.parse(feed) - assert(parsed.title == "Example Feed") - assert(parsed.description == nil) - assert(parsed.id == "urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6") - assert(parsed.last_update == ~U[2003-12-13T18:30:02Z]) + test "RFC4287 more extensive, single-entry Atom Feed Document" do 
+ feed = File.read!("test/fixtures/rfc4287/extensive_single_entry.atom") + + {:ok, parsed} = NewsParseEx.parse(feed) + assert(parsed.title == "dive into mark") + assert(parsed.description == nil) + assert(parsed.id == "tag:example.org,2003:3") + assert(parsed.last_update == ~U[2005-07-31T12:29:29Z]) + end end - test "parses RFC4287 more extensive, single-entry Atom Feed Document" do - feed = File.read!("test/fixtures/rfc4287/extensive_single_entry.atom") + describe "Parses Atom title" do + setup do + %{feed: ~s[<?xml version="1.0" encoding="utf-8"?> +<feed xmlns="http://www.w3.org/2005/Atom"> + {{title}} + <id>https://example.org/feed/</id> + <updated>2021-11-01T16:09:55Z</updated> +</feed>]} + end + + test "plain text", %{feed: feed} do + {:ok, parsed} = + feed + |> String.replace("{{title}}", ~s{<title type="text">Less: &lt;</title>}) + |> NewsParseEx.parse() + + assert(parsed.title == "Less: <") + end + + test "HTML", %{feed: feed} do + {:ok, parsed} = + feed + |> String.replace( + "{{title}}", + ~s{<title type="html">Less: &lt;em> &amp;lt; &lt;/em></title>} + ) + |> NewsParseEx.parse() + + assert( + parsed.title == + {:html, [{:xmlText, [title: 2, feed: 1], 1, [], 'Less: <em> &lt; </em>', :text}]} + ) + end + + test "XHTML xhtml:div", %{feed: feed} do + title = ~s{<title type="xhtml" xmlns:xhtml="http://www.w3.org/1999/xhtml"> + <xhtml:div> + Less: <xhtml:em> &lt; </xhtml:em> + </xhtml:div> +</title>} + + {:ok, parsed} = + feed + |> String.replace("{{title}}", title) + |> NewsParseEx.parse() + + assert({:xhtml, [title]} = parsed.title) + assert(XML.string_from_doc(title) == "Less: \n <") + end + + test "XHTML div", %{feed: feed} do + title = ~s{<title type="xhtml" xmlns:xhtml="http://www.w3.org/1999/xhtml"> + <div xmlns="http://www.w3.org/1999/xhtml"> + Less: <em> &lt; </em> + </div> +</title>} + + {:ok, parsed} = + feed + |> String.replace("{{title}}", title) + |> NewsParseEx.parse() - {:ok, parsed} = NewsParseEx.parse(feed) - assert(parsed.title == "dive into 
mark") - assert(parsed.description == nil) - assert(parsed.id == "tag:example.org,2003:3") - assert(parsed.last_update == ~U[2005-07-31T12:29:29Z]) + assert({:xhtml, [title]} = parsed.title) + assert(XML.string_from_doc(title) == "Less: \n <") + end end - test "parses git.sr.ht RSS feed" do - feed = File.read!("test/fixtures/git.sr.ht/pkgconf-2022-12-27.rss") + describe "Parses RSS Document" do + test "git.sr.ht tags" do + feed = File.read!("test/fixtures/git.sr.ht/pkgconf-2022-12-27.rss") - {:ok, parsed} = NewsParseEx.parse(feed) - assert(parsed.title == "~kaniini/pkgconf refs") - assert(parsed.description == "Git refs for ~kaniini/pkgconf") - assert(parsed.id == "https://git.sr.ht/~kaniini/pkgconf/refs") - assert(parsed.last_update == nil) + {:ok, parsed} = NewsParseEx.parse(feed) + assert(parsed.title == "~kaniini/pkgconf refs") + assert(parsed.description == "Git refs for ~kaniini/pkgconf") + assert(parsed.id == "https://git.sr.ht/~kaniini/pkgconf/refs") + assert(parsed.last_update == nil) + end end end