logo

news_parse_ex

commit: fd59c289e863d304403c694419d17bff4927373c
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Mon, 26 Dec 2022 15:29:38 +0100

init

Diffstat:

A .formatter.exs | 4 ++++
A .gitignore | 9 +++++++++
A README.md | 3 +++
A lib/news_parse_ex.ex | 47 +++++++++++++++++++++++++++++++++++++++++++++++
A lib/xml.ex | 45 +++++++++++++++++++++++++++++++++++++++++++++
A mix.exs | 24 ++++++++++++++++++++++++
A test/news_parse_ex_test.exs | 18 ++++++++++++++++++
A test/test_helper.exs | 1 +
8 files changed, 151 insertions(+), 0 deletions(-)

diff --git a/.formatter.exs b/.formatter.exs
@@ -0,0 +1,4 @@
+# Used by "mix format"
+[
+  inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
+]
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+/_build/
+/cover/
+/deps/
+/doc/
+/.fetch
+erl_crash.dump
+*.ez
+news_parse_ex-*.tar
+/tmp/
diff --git a/README.md b/README.md
@@ -0,0 +1,3 @@
+# NewsParseEx
+
+Library to parse RSS/Atom news feeds
diff --git a/lib/news_parse_ex.ex b/lib/news_parse_ex.ex
@@ -0,0 +1,87 @@
+# NewsParseEx: RSS/Atom parser
+# Copyright © 2022 Haelwenn (lanodan) Monnier <contact+news_parse_ex@hacktivis.me>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule NewsParseEx do
+  @moduledoc """
+  Parser for RSS/Atom news feeds.
+
+  Only Atom is recognised so far, and only feed-level metadata is extracted:
+  `parse/1` always returns an empty `:entries` list.
+  """
+
+  alias NewsParseEx.XML
+
+  @doc """
+  Detects the feed format of a document returned by `NewsParseEx.XML.parse_document/1`.
+
+  Returns `{:ok, :atom}` when the root element is `<feed>` and its namespace is
+  the Atom one, `{:error, reason}` otherwise.
+  """
+  @spec get_feed_type(term()) :: {:ok, :atom} | {:error, String.t() | nil}
+  def get_feed_type(doc) do
+    case XML.string_from_xpath(~s[name()], doc) do
+      "feed" ->
+        case XML.string_from_xpath(~s[/feed/namespace::*], doc) do
+          "http://www.w3.org/2005/Atom" -> {:ok, :atom}
+          e -> {:error, e}
+        end
+
+      # The root mismatch is a returned branch so that the error actually
+      # reaches the caller instead of being computed and dropped.
+      root_name ->
+        {:error, "XML root isn't <feed> but #{root_name}"}
+    end
+  end
+
+  @doc "Returns `{:ok, title}` with the text of `/feed/title` (`nil` when absent or empty)."
+  @spec get_feed_title(term(), :atom) :: {:ok, String.t() | nil}
+  def get_feed_title(doc, :atom), do: {:ok, XML.string_from_xpath(~s[/feed/title/text()], doc)}
+
+  @doc "Returns `{:ok, id}` with the text of `/feed/id` (`nil` when absent or empty)."
+  @spec get_feed_id(term(), :atom) :: {:ok, String.t() | nil}
+  def get_feed_id(doc, :atom), do: {:ok, XML.string_from_xpath(~s[/feed/id/text()], doc)}
+
+  @doc """
+  Parses `/feed/updated` as an ISO 8601 timestamp.
+
+  Returns whatever `DateTime.from_iso8601/1` returns, or
+  `{:error, :missing_updated}` when the element is absent or empty.
+  """
+  @spec get_feed_last_update(term(), :atom) ::
+          {:ok, DateTime.t(), integer()} | {:error, atom()}
+  def get_feed_last_update(doc, :atom) do
+    case XML.string_from_xpath(~s[/feed/updated/text()], doc) do
+      # `DateTime.from_iso8601/1` only accepts binaries, so `nil` must not reach it.
+      nil -> {:error, :missing_updated}
+      updated -> DateTime.from_iso8601(updated)
+    end
+  end
+
+  @doc """
+  Parses a feed given as a string.
+
+  Returns `{:ok, map}` with the keys `:type`, `:title`, `:id`, `:last_update`
+  and `:entries`. When a step fails, the tagged result of that step is
+  returned as is, e.g. `{:parse, :error}` or `{:type, {:error, reason}}`.
+  """
+  @spec parse(bitstring()) :: {:ok, map()} | {atom(), term()}
+  def parse(str) when is_bitstring(str) do
+    with {_, {:ok, doc}} <- {:parse, XML.parse_document(str)},
+         {_, {:ok, feed_type}} <- {:type, get_feed_type(doc)},
+         {_, {:ok, title}} <- {:title, get_feed_title(doc, feed_type)},
+         {_, {:ok, id}} <- {:id, get_feed_id(doc, feed_type)},
+         {_, {:ok, last_update, _tz_offset}} <-
+           {:last_update, get_feed_last_update(doc, feed_type)} do
+      data = %{
+        :type => feed_type,
+        :title => title,
+        :id => id,
+        :last_update => last_update,
+        :entries => []
+      }
+
+      {:ok, data}
+    end
+  end
+end
diff --git a/lib/xml.ex b/lib/xml.ex
@@ -0,0 +1,67 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule NewsParseEx.XML do
+  @moduledoc """
+  Small wrappers around `:xmerl` to parse a document and to evaluate XPath
+  expressions as strings.
+  """
+
+  require Logger
+
+  @doc """
+  Evaluates `string(xpath)` against `doc` and returns the trimmed result.
+
+  Returns `nil` when the result is empty, when evaluation fails, or when `doc`
+  is `:error` (the failure value of `parse_document/1`).
+  """
+  @spec string_from_xpath(String.t(), term()) :: String.t() | nil
+  def string_from_xpath(_, :error), do: nil
+
+  def string_from_xpath(xpath, doc) do
+    try do
+      {:xmlObj, :string, res} = :xmerl_xpath.string(~c"string(#{xpath})", doc)
+
+      res =
+        res
+        |> to_string()
+        |> String.trim()
+
+      if res == "", do: nil, else: res
+    catch
+      # `catch _e ->` only matches throws; the two-argument form also covers
+      # raised errors (e.g. a failed match above) and exits.
+      _kind, _reason ->
+        Logger.debug("Couldn't find xpath #{xpath} in XML doc")
+        nil
+    end
+  end
+
+  @doc """
+  Parses `text` with `:xmerl_scan`.
+
+  Returns `{:ok, doc}`, or `:error` when scanning raises or exits.
+  """
+  # NOTE(review): xmerl is generally understood to create atoms for element and
+  # attribute names; confirm this is acceptable before feeding untrusted input.
+  @spec parse_document(binary()) :: {:ok, term()} | :error
+  def parse_document(text) do
+    try do
+      {doc, _rest} =
+        text
+        |> :binary.bin_to_list()
+        |> :xmerl_scan.string(quiet: true)
+
+      {:ok, doc}
+    rescue
+      _e ->
+        Logger.debug("Couldn't parse XML: #{inspect(text)}")
+        :error
+    catch
+      :exit, _error ->
+        Logger.debug("Couldn't parse XML: #{inspect(text)}")
+        :error
+    end
+  end
+end
diff --git a/mix.exs b/mix.exs
@@ -0,0 +1,24 @@
+defmodule NewsParseEx.MixProject do
+  use Mix.Project
+
+  def project do
+    [
+      app: :news_parse_ex,
+      version: "0.1.0",
+      elixir: "~> 1.14",
+      start_permanent: Mix.env() == :prod,
+      deps: deps()
+    ]
+  end
+
+  # Run "mix help compile.app" to learn about applications.
+  def application do
+    [
+      extra_applications: [:logger, :xmerl]
+    ]
+  end
+
+  defp deps do
+    []
+  end
+end
diff --git a/test/news_parse_ex_test.exs b/test/news_parse_ex_test.exs
@@ -0,0 +1,29 @@
+defmodule NewsParseExTest do
+  use ExUnit.Case
+  doctest NewsParseEx
+
+  test "parses basic Atom feed" do
+    feed = ~s[<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Test Title</title>
+  <id>https://example.org/feed/</id>
+  <updated>2021-11-01T16:09:55Z</updated>
+</feed>]
+
+    {:ok, parsed} = NewsParseEx.parse(feed)
+    assert(parsed.title == "Test Title")
+    assert(parsed.id == "https://example.org/feed/")
+    assert(parsed.last_update == ~U[2021-11-01 16:09:55Z])
+  end
+
+  test "reports a root element other than <feed>" do
+    assert {:type, {:error, "XML root isn't <feed> but rss"}} =
+             NewsParseEx.parse(~s[<rss version="2.0"></rss>])
+  end
+
+  test "reports a missing <updated> element" do
+    feed = ~s[<feed xmlns="http://www.w3.org/2005/Atom"><title>T</title><id>urn:x</id></feed>]
+
+    assert {:last_update, {:error, :missing_updated}} = NewsParseEx.parse(feed)
+  end
+end
diff --git a/test/test_helper.exs b/test/test_helper.exs
@@ -0,0 +1 @@
+ExUnit.start()