commit: 45445057618ff1d9b3c4980e6befc69d56a6142f
parent b4c5cc39f65e3cd3e5e86be4631a3dcfb4bda593
Author: feld <feld@feld.me>
Date: Wed, 17 Jul 2024 17:50:17 +0000
Merge branch 'oban/rich-media-invalid' into 'develop'
Rework some Rich Media functionality for better error handling
See merge request pleroma/pleroma!4182
Diffstat:
8 files changed, 74 insertions(+), 50 deletions(-)
diff --git a/changelog.d/oban-rich-media-errors.fix b/changelog.d/oban-rich-media-errors.fix
@@ -0,0 +1 @@
+Prevent Rich Media backfill jobs from retrying in cases where it is likely they will fail again.
diff --git a/lib/pleroma/web/rich_media/backfill.ex b/lib/pleroma/web/rich_media/backfill.ex
@@ -4,6 +4,7 @@
defmodule Pleroma.Web.RichMedia.Backfill do
alias Pleroma.Web.RichMedia.Card
+ alias Pleroma.Web.RichMedia.Helpers
alias Pleroma.Web.RichMedia.Parser
alias Pleroma.Web.RichMedia.Parser.TTL
alias Pleroma.Workers.RichMediaWorker
@@ -16,8 +17,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do
Pleroma.Web.ActivityPub.ActivityPub
)
- @spec run(map()) ::
- :ok | {:error, {:invalid_metadata, any()} | :body_too_large | {:content, any()} | any()}
+ @spec run(map()) :: :ok | Parser.parse_errors() | Helpers.get_errors()
def run(%{"url" => url} = args) do
url_hash = Card.url_to_hash(url)
@@ -33,22 +33,16 @@ defmodule Pleroma.Web.RichMedia.Backfill do
end
warm_cache(url_hash, card)
+ :ok
- {:error, {:invalid_metadata, fields}} ->
- Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
- negative_cache(url_hash)
-
- {:error, :body_too_large} ->
- Logger.error("Rich media error for #{url}: :body_too_large")
- negative_cache(url_hash)
-
- {:error, {:content_type, type}} ->
- Logger.debug("Rich media error for #{url}: :content_type is #{type}")
+ {:error, type} = error
+ when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
negative_cache(url_hash)
+ error
- e ->
- Logger.debug("Rich media error for #{url}: #{inspect(e)}")
- {:error, e}
+ {:error, type} = error
+ when type in [:get, :head] ->
+ error
end
end
diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex
@@ -5,26 +5,38 @@
defmodule Pleroma.Web.RichMedia.Helpers do
alias Pleroma.Config
+ require Logger
+
+ @type get_errors :: {:error, :body_too_large | :content_type | :head | :get}
+
+ @spec rich_media_get(String.t()) :: {:ok, String.t()} | get_errors()
def rich_media_get(url) do
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
- head_check =
- case Pleroma.HTTP.head(url, headers, http_options()) do
- # If the HEAD request didn't reach the server for whatever reason,
- # we assume the GET that comes right after won't either
- {:error, _} = e ->
- e
+ with {_, {:ok, %Tesla.Env{status: 200, headers: headers}}} <-
+ {:head, Pleroma.HTTP.head(url, headers, http_options())},
+ {_, :ok} <- {:content_type, check_content_type(headers)},
+ {_, :ok} <- {:content_length, check_content_length(headers)},
+ {_, {:ok, %Tesla.Env{status: 200, body: body}}} <-
+ {:get, Pleroma.HTTP.get(url, headers, http_options())} do
+ {:ok, body}
+ else
+ {:head, _} ->
+ Logger.debug("Rich media error for #{url}: HTTP HEAD failed")
+ {:error, :head}
- {:ok, %Tesla.Env{status: 200, headers: headers}} ->
- with :ok <- check_content_type(headers),
- :ok <- check_content_length(headers),
- do: :ok
+ {:content_type, {_, type}} ->
+ Logger.debug("Rich media error for #{url}: content-type is #{type}")
+ {:error, :content_type}
- _ ->
- :ok
- end
+ {:content_length, {_, length}} ->
+ Logger.debug("Rich media error for #{url}: content-length is #{length}")
+ {:error, :body_too_large}
- with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
+ {:get, _} ->
+ Logger.debug("Rich media error for #{url}: HTTP GET failed")
+ {:error, :get}
+ end
end
defp check_content_type(headers) do
@@ -32,7 +44,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
{_, content_type} ->
case Plug.Conn.Utils.media_type(content_type) do
{:ok, "text", "html", _} -> :ok
- _ -> {:error, {:content_type, content_type}}
+ _ -> {:error, content_type}
end
_ ->
@@ -47,7 +59,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
{_, maybe_content_length} ->
case Integer.parse(maybe_content_length) do
{content_length, ""} when content_length <= max_body -> :ok
- {_, ""} -> {:error, :body_too_large}
+ {_, ""} -> {:error, maybe_content_length}
_ -> :ok
end
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser do
+ alias Pleroma.Web.RichMedia.Helpers
require Logger
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
@@ -11,24 +12,26 @@ defmodule Pleroma.Web.RichMedia.Parser do
Pleroma.Config.get([:rich_media, :parsers])
end
- def parse(nil), do: nil
+ @type parse_errors :: {:error, :rich_media_disabled | :validate}
- @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
- def parse(url) do
+ @spec parse(String.t()) ::
+ {:ok, map()} | parse_errors() | Helpers.get_errors()
+ def parse(url) when is_binary(url) do
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
- :ok <- validate_page_url(url),
- {:ok, data} <- parse_url(url) do
+ {_, :ok} <- {:validate, validate_page_url(url)},
+ {_, {:ok, data}} <- {:parse, parse_url(url)} do
data = Map.put(data, "url", url)
{:ok, data}
else
{:config, _} -> {:error, :rich_media_disabled}
- e -> e
+ {:validate, _} -> {:error, :validate}
+ {:parse, error} -> error
end
end
defp parse_url(url) do
- with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
- {:ok, html} <- Floki.parse_document(html) do
+ with {:ok, body} <- Helpers.rich_media_get(url),
+ {:ok, html} <- Floki.parse_document(body) do
html
|> maybe_parse()
|> clean_parsed_data()
@@ -50,8 +53,8 @@ defmodule Pleroma.Web.RichMedia.Parser do
{:ok, data}
end
- defp check_parsed_data(data) do
- {:error, {:invalid_metadata, data}}
+ defp check_parsed_data(_data) do
+ {:error, :invalid_metadata}
end
defp clean_parsed_data(data) do
diff --git a/lib/pleroma/web/rich_media/parsers/o_embed.ex b/lib/pleroma/web/rich_media/parsers/o_embed.ex
@@ -22,7 +22,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do
end
defp get_oembed_data(url) do
- with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
+ with {:ok, json} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
Jason.decode(json)
end
end
diff --git a/lib/pleroma/workers/rich_media_worker.ex b/lib/pleroma/workers/rich_media_worker.ex
@@ -14,7 +14,21 @@ defmodule Pleroma.Workers.RichMediaWorker do
end
def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do
- Backfill.run(args)
+ case Backfill.run(args) do
+ :ok ->
+ :ok
+
+ {:error, type}
+ when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
+ {:cancel, type}
+
+ {:error, type}
+ when type in [:get, :head] ->
+ {:error, type}
+
+ error ->
+ {:error, error}
+ end
end
@impl Oban.Worker
diff --git a/test/pleroma/web/rich_media/parser_test.exs b/test/pleroma/web/rich_media/parser_test.exs
@@ -20,7 +20,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end
test "doesn't just add a title" do
- assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp")
+ assert {:error, :invalid_metadata} = Parser.parse("https://example.com/non-ogp")
end
test "parses ogp" do
@@ -96,7 +96,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end
test "returns error if getting page was not successful" do
- assert {:error, :overload} = Parser.parse("https://example.com/error")
+ assert {:error, :get} = Parser.parse("https://example.com/error")
end
test "does a HEAD request to check if the body is too large" do
@@ -104,17 +104,17 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end
test "does a HEAD request to check if the body is html" do
- assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file")
+ assert {:error, :content_type} = Parser.parse("https://example.com/pdf-file")
end
test "refuses to crawl incomplete URLs" do
url = "example.com/ogp"
- assert :error == Parser.parse(url)
+ assert {:error, :validate} == Parser.parse(url)
end
test "refuses to crawl malformed URLs" do
url = "example.com[]/ogp"
- assert :error == Parser.parse(url)
+ assert {:error, :validate} == Parser.parse(url)
end
test "refuses to crawl URLs of private network from posts" do
@@ -126,7 +126,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
"https://pleroma.local/notice/9kCP7V"
]
|> Enum.each(fn url ->
- assert :error == Parser.parse(url)
+ assert {:error, :validate} == Parser.parse(url)
end)
end
diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex
@@ -1724,7 +1724,7 @@ defmodule HttpRequestMock do
]
def head(url, _query, _body, _headers) when url in @rich_media_mocks do
- {:ok, %Tesla.Env{status: 404, body: ""}}
+ {:ok, %Tesla.Env{status: 200, body: ""}}
end
def head("https://example.com/pdf-file", _, _, _) do