commit: 3dedadf192a3acd0c1dfc2b11eba5a247ae7f61c
parent 9f16ca80e0fe60b8b0e3e8ddb9b06ca0bec31002
Author: Ekaterina Vaartis <vaartis@kotobank.ch>
Date: Sun, 22 Aug 2021 19:38:03 +0300
Adjust content indexing to skip more unneeded stuff
Diffstat:
1 file changed, 31 insertions(+), 16 deletions(-)
diff --git a/lib/mix/tasks/pleroma/search/meilisearch.ex b/lib/mix/tasks/pleroma/search/meilisearch.ex
@@ -52,13 +52,6 @@ defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
timeout: :infinity
)
|> Stream.chunk_every(chunk_size)
- |> Stream.transform(0, fn objects, acc ->
- new_acc = acc + Enum.count(objects)
-
- IO.puts("Indexed #{new_acc} entries")
-
- {[objects], new_acc}
- end)
|> Stream.map(fn objects ->
Enum.map(objects, fn object ->
data = object.data
@@ -70,15 +63,34 @@ defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
end
{:ok, published, _} = DateTime.from_iso8601(data["published"])
- {:ok, content} = FastSanitize.strip_tags(content_str)
-
- %{
- id: object.id,
- content: content,
- ap: data["id"],
- published: published |> DateTime.to_unix()
- }
+
+ content =
+ with {:ok, scrubbed} <- FastSanitize.strip_tags(content_str),
+ trimmed <- String.trim(scrubbed) do
+ trimmed
+ end
+
+ # Only index if there is anything in the string. If there is a single symbol,
+ # it's probably a dot from mastodon posts with just the picture
+ if String.length(content) > 1 do
+ %{
+ id: object.id,
+ content: content,
+ ap: data["id"],
+ published: published |> DateTime.to_unix()
+ }
+ else
+ nil
+ end
end)
+ |> Enum.filter(fn o -> not is_nil(o) end)
+ end)
+ |> Stream.transform(0, fn objects, acc ->
+ new_acc = acc + Enum.count(objects)
+
+ IO.puts("Indexed #{new_acc} entries")
+
+ {[objects], new_acc}
end)
|> Stream.each(fn objects ->
{:ok, result} =
@@ -102,6 +114,9 @@ defmodule Mix.Tasks.Pleroma.Search.Meilisearch do
endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url])
- {:ok, _} = Pleroma.HTTP.request(:delete, "#{endpoint}/indexes/objects", "", [], [])
+ {:ok, _} =
+ Pleroma.HTTP.request(:delete, "#{endpoint}/indexes/objects/documents", "", [],
+ timeout: :infinity
+ )
end
end