From cd7e2138d11901fc7a0c8c2f22b7a5d57383a555 Mon Sep 17 00:00:00 2001 From: Lain Soykaf Date: Tue, 14 May 2024 14:13:37 +0400 Subject: [PATCH] Search: Basic Qdrant/Ollama search --- config/config.exs | 9 ++ lib/mix/tasks/pleroma/search/indexer.ex | 60 ++++++++++++ lib/pleroma/search/qdrant_search.ex | 117 ++++++++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 lib/mix/tasks/pleroma/search/indexer.ex create mode 100644 lib/pleroma/search/qdrant_search.ex diff --git a/config/config.exs b/config/config.exs index b69044a2b..f74eda6b2 100644 --- a/config/config.exs +++ b/config/config.exs @@ -915,6 +915,15 @@ config :pleroma, Pleroma.Application, config :pleroma, Pleroma.Uploaders.Uploader, timeout: 30_000 +config :pleroma, Pleroma.Search.QdrantSearch, + qdrant_url: "http://127.0.0.1:6333/", + qdrant_api_key: nil, + ollama_url: "http://127.0.0.1:11434", + ollama_model: "snowflake-arctic-embed:xs", + qdrant_index_configuration: %{ + vectors: %{size: 384, distance: "Cosine"} + } + # Import environment specific config. This must remain at the bottom # of this file so it overrides the configuration defined above. import_config "#{Mix.env()}.exs" diff --git a/lib/mix/tasks/pleroma/search/indexer.ex b/lib/mix/tasks/pleroma/search/indexer.ex new file mode 100644 index 000000000..ffa2f3c94 --- /dev/null +++ b/lib/mix/tasks/pleroma/search/indexer.ex @@ -0,0 +1,60 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2021 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Mix.Tasks.Pleroma.Search.Indexer do + import Mix.Pleroma + import Ecto.Query + + alias Pleroma.Workers.SearchIndexingWorker + + def run(["index" | options]) do + {options, [], []} = + OptionParser.parse( + options, + strict: [ + limit: :integer + ] + ) + + start_pleroma() + + limit = Keyword.get(options, :limit, 100_000) + + per_step = 1000 + chunks = max(div(limit, per_step), 1) + + 1..chunks + |> Enum.each(fn step -> + q = + from(a in Pleroma.Activity, + limit: ^per_step, + offset: ^per_step * (^step - 1), + select: [:id], + order_by: [desc: :id] + ) + + {:ok, ids} = + Pleroma.Repo.transaction(fn -> + Pleroma.Repo.stream(q, timeout: :infinity) + |> Enum.map(fn a -> + a.id + end) + end) + + IO.puts("Got #{length(ids)} activities, adding to indexer") + + ids + |> Enum.chunk_every(100) + |> Enum.each(fn chunk -> + IO.puts("Adding #{length(chunk)} activities to indexing queue") + + chunk + |> Enum.map(fn id -> + SearchIndexingWorker.new(%{"op" => "add_to_index", "activity" => id}) + end) + |> Oban.insert_all() + end) + end) + end +end diff --git a/lib/pleroma/search/qdrant_search.ex b/lib/pleroma/search/qdrant_search.ex new file mode 100644 index 000000000..fcaa9e686 --- /dev/null +++ b/lib/pleroma/search/qdrant_search.ex @@ -0,0 +1,117 @@ +defmodule Pleroma.Search.QdrantSearch do + @behaviour Pleroma.Search.SearchBackend + import Ecto.Query + alias Pleroma.Activity + + alias __MODULE__.QdrantClient + alias __MODULE__.OllamaClient + + import Pleroma.Search.Meilisearch, only: [object_to_search_data: 1] + + def initialize_index() do + payload = Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_index_configuration]) + QdrantClient.put("/collections/posts", payload) + end + + def drop_index() do + QdrantClient.delete("/collections/posts") + end + + def get_embedding(text) do + with {:ok, %{body: %{"embedding" => embedding}}} <- + OllamaClient.post("/api/embeddings", %{ + prompt: text, + model: Pleroma.Config.get([Pleroma.Search.QdrantSearch, :ollama_model]) + }) + |> IO.inspect() do + {:ok, embedding} + else + _ -> + {:error, "Failed to get embedding"} + end + end + + defp build_index_payload(activity, embedding) do + %{ + points: [ + %{ + id: activity.id |> FlakeId.from_string() |> Ecto.UUID.cast!(), + vector: embedding + } + ] + } + end + + defp build_search_payload(embedding) do + %{ + vector: embedding, + limit: 20 + } + end + + @impl true + def add_to_index(activity) do + # This will only index public or unlisted notes + maybe_search_data = object_to_search_data(activity.object) + IO.puts("TRYING TO INDEX\n\n") + + if activity.data["type"] == "Create" and maybe_search_data do + with {:ok, embedding} <- get_embedding(maybe_search_data.content), + {:ok, %{status: 200}} <- + QdrantClient.put( + "/collections/posts/points", + build_index_payload(activity, embedding) + ) do + :ok + else + e -> {:error, e} + end + else + :ok + end + end + + @impl true + def search(_user, query, _options) do + with {:ok, embedding} <- get_embedding(query), + {:ok, %{body: %{"result" => result}}} <- + QdrantClient.post("/collections/posts/points/search", build_search_payload(embedding)) do + ids = + Enum.map(result, fn %{"id" => id} -> + Ecto.UUID.dump!(id) + end) + + from(a in Activity, where: a.id in ^ids) + |> Activity.with_preloaded_object() + |> Activity.restrict_deactivated_users() + |> Ecto.Query.order_by([a], fragment("array_position(?, ?)", ^ids, a.id)) + |> Pleroma.Repo.all() + else + _ -> + [] + end + end + + @impl true + def remove_from_index(_object) do + :ok + end +end + +defmodule Pleroma.Search.QdrantSearch.OllamaClient do + use Tesla + + plug(Tesla.Middleware.BaseUrl, Pleroma.Config.get([Pleroma.Search.QdrantSearch, :ollama_url])) + plug(Tesla.Middleware.JSON) +end + +defmodule Pleroma.Search.QdrantSearch.QdrantClient do + use Tesla + + plug(Tesla.Middleware.BaseUrl, Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_url])) + plug(Tesla.Middleware.JSON) + + plug(Tesla.Middleware.Headers, [ + {"api-key", Pleroma.Config.get([Pleroma.Search.QdrantSearch, :qdrant_api_key])} + ]) +end