Merge branch 'rich-media-cache' into 'develop' - pleroma - My custom branch(es) on git.pleroma.social/pleroma/pleroma

commit: e9573627792df4cdaea15f1ca1563594f477cd8e
parent 0b9990a7e53061439a7fa9dbe3e39e3ee22d1371
Author: feld <feld@feld.me>
Date:   Mon,  5 Feb 2024 05:58:07 +0000

Merge branch 'rich-media-cache' into 'develop'

Fix Rich Media Previews for updated activities

See merge request pleroma/pleroma!4052
Diffstat:
A changelog.d/rich_media.fix 1 +
M lib/pleroma/activity/html.ex 2 +-
M lib/pleroma/html.ex 31 +++++++++++--------------------
M lib/pleroma/web/rich_media/helpers.ex 27 ++++++++++++++++++++++++---
A test/fixtures/rich_media/google.html 12 ++++++++++++
A test/fixtures/rich_media/yahoo.html 12 ++++++++++++
M test/pleroma/web/rich_media/helpers_test.exs 40 +++++++++++++++++++++++++++++++++-------
M test/support/http_request_mock.ex 13 ++++++++++++-

8 files changed, 106 insertions(+), 32 deletions(-)
diff --git a/changelog.d/rich_media.fix b/changelog.d/rich_media.fix
@@ -0,0 +1 @@
+Rich Media Preview cache eviction when the activity is updated.
diff --git a/lib/pleroma/activity/html.ex b/lib/pleroma/activity/html.ex
@@ -28,7 +28,7 @@ defmodule Pleroma.Activity.HTML do
     end
   end
 
-  defp add_cache_key_for(activity_id, additional_key) do
+  def add_cache_key_for(activity_id, additional_key) do
     current = get_cache_keys_for(activity_id)
 
     unless additional_key in current do
diff --git a/lib/pleroma/html.ex b/lib/pleroma/html.ex
@@ -6,8 +6,6 @@ defmodule Pleroma.HTML do
   # Scrubbers are compiled on boot so they can be configured in OTP releases
   #  @on_load :compile_scrubbers
 
-  @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
-
   def compile_scrubbers do
     dir = Path.join(:code.priv_dir(:pleroma), "scrubbers")
 
@@ -67,27 +65,20 @@ defmodule Pleroma.HTML do
     end
   end
 
-  def extract_first_external_url_from_object(%{data: %{"content" => content}} = object)
+  @spec extract_first_external_url_from_object(Pleroma.Object.t()) ::
+          {:ok, String.t()} | {:error, :no_content}
+  def extract_first_external_url_from_object(%{data: %{"content" => content}})
       when is_binary(content) do
-    unless object.data["fake"] do
-      key = "URL|#{object.id}"
+    url =
+      content
+      |> Floki.parse_fragment!()
+      |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
+      |> Enum.take(1)
+      |> Floki.attribute("href")
+      |> Enum.at(0)
 
-      @cachex.fetch!(:scrubber_cache, key, fn _key ->
-        {:commit, {:ok, extract_first_external_url(content)}}
-      end)
-    else
-      {:ok, extract_first_external_url(content)}
-    end
+    {:ok, url}
   end
 
   def extract_first_external_url_from_object(_), do: {:error, :no_content}
-
-  def extract_first_external_url(content) do
-    content
-    |> Floki.parse_fragment!()
-    |> Floki.find("a:not(.mention,.hashtag,.attachment,[rel~=\"tag\"])")
-    |> Enum.take(1)
-    |> Floki.attribute("href")
-    |> Enum.at(0)
-  end
 end
diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex
@@ -8,6 +8,8 @@ defmodule Pleroma.Web.RichMedia.Helpers do
   alias Pleroma.Object
   alias Pleroma.Web.RichMedia.Parser
 
+  @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
+
   @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
 
   @options [
@@ -25,9 +27,11 @@ defmodule Pleroma.Web.RichMedia.Helpers do
     |> parse_uri(page_url)
   end
 
-  defp validate_page_url(%URI{host: host, scheme: "https", authority: authority})
-       when is_binary(authority) do
+  defp validate_page_url(%URI{host: host, scheme: "https"}) do
     cond do
+      Linkify.Parser.ip?(host) ->
+        :error
+
       host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
         :error
 
@@ -71,7 +75,24 @@ defmodule Pleroma.Web.RichMedia.Helpers do
   def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
     with true <- @config_impl.get([:rich_media, :enabled]),
          %Object{} = object <- Object.normalize(activity, fetch: false) do
-      fetch_data_for_object(object)
+      if object.data["fake"] do
+        fetch_data_for_object(object)
+      else
+        key = "URL|#{activity.id}"
+
+        @cachex.fetch!(:scrubber_cache, key, fn _ ->
+          result = fetch_data_for_object(object)
+
+          cond do
+            match?(%{page_url: _, rich_media: _}, result) ->
+              Activity.HTML.add_cache_key_for(activity.id, key)
+              {:commit, result}
+
+            true ->
+              {:ignore, %{}}
+          end
+        end)
+      end
     else
       _ -> %{}
     end
diff --git a/test/fixtures/rich_media/google.html b/test/fixtures/rich_media/google.html
@@ -0,0 +1,12 @@
+<meta property="og:url" content="https://google.com">
+<meta property="og:type" content="website">
+<meta property="og:title" content="Google">
+<meta property="og:description" content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.">
+<meta property="og:image" content="">
+
+<meta name="twitter:card" content="summary_large_image">
+<meta property="twitter:domain" content="google.com">
+<meta property="twitter:url" content="https://google.com">
+<meta name="twitter:title" content="Google">
+<meta name="twitter:description" content="Search the world's information, including webpages, images, videos and more. Google has many special features to help you find exactly what you're looking for.">
+<meta name="twitter:image" content="">
diff --git a/test/fixtures/rich_media/yahoo.html b/test/fixtures/rich_media/yahoo.html
@@ -0,0 +1,12 @@
+<meta property="og:url" content="https://yahoo.com">
+<meta property="og:type" content="website">
+<meta property="og:title" content="Yahoo | Mail, Weather, Search, Politics, News, Finance, Sports & Videos">
+<meta property="og:description" content="Latest news coverage, email, free stock quotes, live scores and video are just the beginning. Discover more every day at Yahoo!">
+<meta property="og:image" content="https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo.png">
+
+<meta name="twitter:card" content="summary_large_image">
+<meta property="twitter:domain" content="yahoo.com">
+<meta property="twitter:url" content="https://yahoo.com">
+<meta name="twitter:title" content="Yahoo | Mail, Weather, Search, Politics, News, Finance, Sports & Videos">
+<meta name="twitter:description" content="Latest news coverage, email, free stock quotes, live scores and video are just the beginning. Discover more every day at Yahoo!">
+<meta name="twitter:image" content="https://s.yimg.com/cv/apiv2/social/images/yahoo_default_logo.png">
diff --git a/test/pleroma/web/rich_media/helpers_test.exs b/test/pleroma/web/rich_media/helpers_test.exs
@@ -83,8 +83,34 @@ defmodule Pleroma.Web.RichMedia.HelpersTest do
              Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
   end
 
-  # This does not seem to work. The urls are being fetched.
-  @tag skip: true
+  test "recrawls URLs on updates" do
+    original_url = "https://google.com/"
+    updated_url = "https://yahoo.com/"
+
+    Pleroma.StaticStubbedConfigMock
+    |> stub(:get, fn
+      [:rich_media, :enabled] -> true
+      path -> Pleroma.Test.StaticConfig.get(path)
+    end)
+
+    user = insert(:user)
+    {:ok, activity} = CommonAPI.post(user, %{status: "I like this site #{original_url}"})
+
+    assert match?(
+             %{page_url: ^original_url, rich_media: _},
+             Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
+           )
+
+    {:ok, _} = CommonAPI.update(user, activity, %{status: "I like this site #{updated_url}"})
+
+    activity = Pleroma.Activity.get_by_id(activity.id)
+
+    assert match?(
+             %{page_url: ^updated_url, rich_media: _},
+             Pleroma.Web.RichMedia.Helpers.fetch_data_for_activity(activity)
+           )
+  end
+
   test "refuses to crawl URLs of private network from posts" do
     user = insert(:user)
 
@@ -102,10 +128,10 @@ defmodule Pleroma.Web.RichMedia.HelpersTest do
       path -> Pleroma.Test.StaticConfig.get(path)
     end)
 
-    assert %{} = Helpers.fetch_data_for_activity(activity)
-    assert %{} = Helpers.fetch_data_for_activity(activity2)
-    assert %{} = Helpers.fetch_data_for_activity(activity3)
-    assert %{} = Helpers.fetch_data_for_activity(activity4)
-    assert %{} = Helpers.fetch_data_for_activity(activity5)
+    assert %{} == Helpers.fetch_data_for_activity(activity)
+    assert %{} == Helpers.fetch_data_for_activity(activity2)
+    assert %{} == Helpers.fetch_data_for_activity(activity3)
+    assert %{} == Helpers.fetch_data_for_activity(activity4)
+    assert %{} == Helpers.fetch_data_for_activity(activity5)
   end
 end
diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex
@@ -1464,6 +1464,14 @@ defmodule HttpRequestMock do
      }}
   end
 
+  def get("https://google.com/", _, _, _) do
+    {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/google.html")}}
+  end
+
+  def get("https://yahoo.com/", _, _, _) do
+    {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/yahoo.html")}}
+  end
+
   def get(url, query, body, headers) do
     {:error,
      "Mock response not implemented for GET #{inspect(url)}, #{query}, #{inspect(body)}, #{inspect(headers)}"}
@@ -1539,7 +1547,10 @@ defmodule HttpRequestMock do
   @rich_media_mocks [
     "https://example.com/ogp",
     "https://example.com/ogp-missing-data",
-    "https://example.com/twitter-card"
+    "https://example.com/twitter-card",
+    "https://google.com/",
+    "https://yahoo.com/",
+    "https://pleroma.local/notice/9kCP7V"
   ]
   def head(url, _query, _body, _headers) when url in @rich_media_mocks do
     {:ok, %Tesla.Env{status: 404, body: ""}}

A	changelog.d/rich_media.fix	1	+
M	lib/pleroma/activity/html.ex	2	+-
M	lib/pleroma/html.ex	31	+++++++++++--------------------
M	lib/pleroma/web/rich_media/helpers.ex	27	++++++++++++++++++++++++---
A	test/fixtures/rich_media/google.html	12	++++++++++++
A	test/fixtures/rich_media/yahoo.html	12	++++++++++++
M	test/pleroma/web/rich_media/helpers_test.exs	40	+++++++++++++++++++++++++++++++++-------
M	test/support/http_request_mock.ex	13	++++++++++++-