Fulltext implementation incl. config (#2480)

This adds the main component of the fulltext search.
This PR doesn't do anything yet, besides creating an empty fulltextindex
folder if enabled. Indexing events is done in a separate PR.
This commit is contained in:
Till 2022-09-07 18:15:54 +02:00 committed by GitHub
parent 31f4ae8997
commit d5876abbe9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 647 additions and 2 deletions

164
internal/fulltext/bleve.go Normal file
View file

@ -0,0 +1,164 @@
// Copyright 2022 The Matrix.org Foundation C.I.C.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !wasm
// +build !wasm
package fulltext
import (
"strings"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/matrix-org/dendrite/setup/config"
"github.com/matrix-org/gomatrixserverlib"
)
// Search contains all existing bleve.Index
type Search struct {
FulltextIndex bleve.Index
}
// IndexElement describes the layout of an element to index
type IndexElement struct {
EventID string
RoomID string
Content string
ContentType string
StreamPosition int64
}
// SetContentType sets i.ContentType given an identifier
func (i *IndexElement) SetContentType(v string) {
switch v {
case "m.room.message":
i.ContentType = "content.body"
case gomatrixserverlib.MRoomName:
i.ContentType = "content.name"
case gomatrixserverlib.MRoomTopic:
i.ContentType = "content.topic"
}
}
// New opens a new/existing fulltext index
func New(cfg config.Fulltext) (fts *Search, err error) {
fts = &Search{}
fts.FulltextIndex, err = openIndex(cfg)
if err != nil {
return nil, err
}
return fts, nil
}
// Close closes the fulltext index
func (f *Search) Close() error {
return f.FulltextIndex.Close()
}
// Index indexes the given elements
func (f *Search) Index(elements ...IndexElement) error {
batch := f.FulltextIndex.NewBatch()
for _, element := range elements {
err := batch.Index(element.EventID, element)
if err != nil {
return err
}
}
return f.FulltextIndex.Batch(batch)
}
// Delete deletes an indexed element by the eventID
func (f *Search) Delete(eventID string) error {
return f.FulltextIndex.Delete(eventID)
}
// Search searches the index given a search term, roomIDs and keys.
func (f *Search) Search(term string, roomIDs, keys []string, limit, from int, orderByStreamPos bool) (*bleve.SearchResult, error) {
qry := bleve.NewConjunctionQuery()
termQuery := bleve.NewBooleanQuery()
terms := strings.Split(term, " ")
for _, term := range terms {
matchQuery := bleve.NewMatchQuery(term)
matchQuery.SetField("Content")
termQuery.AddMust(matchQuery)
}
qry.AddQuery(termQuery)
roomQuery := bleve.NewBooleanQuery()
for _, roomID := range roomIDs {
roomSearch := bleve.NewMatchQuery(roomID)
roomSearch.SetField("RoomID")
roomQuery.AddShould(roomSearch)
}
if len(roomIDs) > 0 {
qry.AddQuery(roomQuery)
}
keyQuery := bleve.NewBooleanQuery()
for _, key := range keys {
keySearch := bleve.NewMatchQuery(key)
keySearch.SetField("ContentType")
keyQuery.AddShould(keySearch)
}
if len(keys) > 0 {
qry.AddQuery(keyQuery)
}
s := bleve.NewSearchRequestOptions(qry, limit, from, false)
s.Fields = []string{"*"}
s.SortBy([]string{"_score"})
if orderByStreamPos {
s.SortBy([]string{"-StreamPosition"})
}
return f.FulltextIndex.Search(s)
}
func openIndex(cfg config.Fulltext) (bleve.Index, error) {
m := getMapping(cfg)
if cfg.InMemory {
return bleve.NewMemOnly(m)
}
if index, err := bleve.Open(string(cfg.IndexPath)); err == nil {
return index, nil
}
index, err := bleve.New(string(cfg.IndexPath), m)
if err != nil {
return nil, err
}
return index, nil
}
func getMapping(cfg config.Fulltext) *mapping.IndexMappingImpl {
enFieldMapping := bleve.NewTextFieldMapping()
enFieldMapping.Analyzer = cfg.Language
eventMapping := bleve.NewDocumentMapping()
eventMapping.AddFieldMappingsAt("Content", enFieldMapping)
eventMapping.AddFieldMappingsAt("StreamPosition", bleve.NewNumericFieldMapping())
// Index entries as is
idFieldMapping := bleve.NewKeywordFieldMapping()
eventMapping.AddFieldMappingsAt("ContentType", idFieldMapping)
eventMapping.AddFieldMappingsAt("RoomID", idFieldMapping)
eventMapping.AddFieldMappingsAt("EventID", idFieldMapping)
indexMapping := bleve.NewIndexMapping()
indexMapping.AddDocumentMapping("Event", eventMapping)
indexMapping.DefaultType = "Event"
return indexMapping
}

View file

@ -0,0 +1,250 @@
// Copyright 2022 The Matrix.org Foundation C.I.C.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fulltext_test
import (
"reflect"
"testing"
"github.com/matrix-org/gomatrixserverlib"
"github.com/matrix-org/util"
"github.com/matrix-org/dendrite/internal/fulltext"
"github.com/matrix-org/dendrite/setup/config"
)
func mustOpenIndex(t *testing.T, tempDir string) *fulltext.Search {
t.Helper()
cfg := config.Fulltext{}
cfg.Defaults(config.DefaultOpts{
Generate: true,
Monolithic: true,
})
if tempDir != "" {
cfg.IndexPath = config.Path(tempDir)
cfg.InMemory = false
}
fts, err := fulltext.New(cfg)
if err != nil {
t.Fatal("failed to open fulltext index:", err)
}
return fts
}
func mustAddTestData(t *testing.T, fts *fulltext.Search, firstStreamPos int64) (eventIDs, roomIDs []string) {
t.Helper()
// create some more random data
var batchItems []fulltext.IndexElement
streamPos := firstStreamPos
wantRoomID := util.RandomString(16)
for i := 0; i < 30; i++ {
streamPos++
eventID := util.RandomString(16)
// Create more data for the first room
if i > 15 {
wantRoomID = util.RandomString(16)
}
e := fulltext.IndexElement{
EventID: eventID,
RoomID: wantRoomID,
Content: "lorem ipsum",
StreamPosition: streamPos,
}
e.SetContentType("m.room.message")
batchItems = append(batchItems, e)
roomIDs = append(roomIDs, wantRoomID)
eventIDs = append(eventIDs, eventID)
}
e := fulltext.IndexElement{
EventID: util.RandomString(16),
RoomID: wantRoomID,
Content: "Roomname testing",
StreamPosition: streamPos,
}
e.SetContentType(gomatrixserverlib.MRoomName)
batchItems = append(batchItems, e)
e = fulltext.IndexElement{
EventID: util.RandomString(16),
RoomID: wantRoomID,
Content: "Room topic fulltext",
StreamPosition: streamPos,
}
e.SetContentType(gomatrixserverlib.MRoomTopic)
batchItems = append(batchItems, e)
if err := fts.Index(batchItems...); err != nil {
t.Fatalf("failed to batch insert elements: %v", err)
}
return eventIDs, roomIDs
}
func TestOpen(t *testing.T) {
dataDir := t.TempDir()
fts := mustOpenIndex(t, dataDir)
if err := fts.Close(); err != nil {
t.Fatal("unable to close fulltext index", err)
}
// open existing index
fts = mustOpenIndex(t, dataDir)
defer fts.Close()
}
func TestIndex(t *testing.T) {
fts := mustOpenIndex(t, "")
defer fts.Close()
// add some data
var streamPos int64 = 1
roomID := util.RandomString(8)
eventID := util.RandomString(16)
e := fulltext.IndexElement{
EventID: eventID,
RoomID: roomID,
Content: "lorem ipsum",
StreamPosition: streamPos,
}
e.SetContentType("m.room.message")
if err := fts.Index(e); err != nil {
t.Fatal("failed to index element", err)
}
// create some more random data
mustAddTestData(t, fts, streamPos)
}
func TestDelete(t *testing.T) {
fts := mustOpenIndex(t, "")
defer fts.Close()
eventIDs, roomIDs := mustAddTestData(t, fts, 0)
res1, err := fts.Search("lorem", roomIDs[:1], nil, 50, 0, false)
if err != nil {
t.Fatal(err)
}
if err = fts.Delete(eventIDs[0]); err != nil {
t.Fatal(err)
}
res2, err := fts.Search("lorem", roomIDs[:1], nil, 50, 0, false)
if err != nil {
t.Fatal(err)
}
if res1.Total <= res2.Total {
t.Fatalf("got unexpected result: %d <= %d", res1.Total, res2.Total)
}
}
func TestSearch(t *testing.T) {
type args struct {
term string
keys []string
limit int
from int
orderByStreamPos bool
roomIndex []int
}
tests := []struct {
name string
args args
wantCount int
wantErr bool
}{
{
name: "Can search for many results in one room",
wantCount: 16,
args: args{
term: "lorem",
roomIndex: []int{0},
limit: 20,
},
},
{
name: "Can search for one result in one room",
wantCount: 1,
args: args{
term: "lorem",
roomIndex: []int{16},
limit: 20,
},
},
{
name: "Can search for many results in multiple rooms",
wantCount: 17,
args: args{
term: "lorem",
roomIndex: []int{0, 16},
limit: 20,
},
},
{
name: "Can search for many results in all rooms, reversed",
wantCount: 30,
args: args{
term: "lorem",
limit: 30,
orderByStreamPos: true,
},
},
{
name: "Can search for specific search room name",
wantCount: 1,
args: args{
term: "testing",
roomIndex: []int{},
limit: 20,
keys: []string{"content.name"},
},
},
{
name: "Can search for specific search room topic",
wantCount: 1,
args: args{
term: "fulltext",
roomIndex: []int{},
limit: 20,
keys: []string{"content.topic"},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
f := mustOpenIndex(t, "")
eventIDs, roomIDs := mustAddTestData(t, f, 0)
var searchRooms []string
for _, x := range tt.args.roomIndex {
searchRooms = append(searchRooms, roomIDs[x])
}
t.Logf("searching in rooms: %v - %v\n", searchRooms, tt.args.keys)
got, err := f.Search(tt.args.term, searchRooms, tt.args.keys, tt.args.limit, tt.args.from, tt.args.orderByStreamPos)
if (err != nil) != tt.wantErr {
t.Errorf("Search() error = %v, wantErr %v", err, tt.wantErr)
return
}
if !reflect.DeepEqual(len(got.Hits), tt.wantCount) {
t.Errorf("Search() got = %v, want %v", len(got.Hits), tt.wantCount)
}
if tt.args.orderByStreamPos {
if got.Hits[0].ID != eventIDs[29] {
t.Fatalf("expected ID %s, got %s", eventIDs[29], got.Hits[0].ID)
}
}
})
}
}

View file

@ -0,0 +1,65 @@
// Copyright 2022 The Matrix.org Foundation C.I.C.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fulltext
import (
"github.com/matrix-org/dendrite/setup/config"
"time"
)
type Search struct{}
type IndexElement struct {
EventID string
RoomID string
Content string
ContentType string
StreamPosition int64
}
type SearchResult struct {
Status interface{} `json:"status"`
Request *interface{} `json:"request"`
Hits []interface{} `json:"hits"`
Total uint64 `json:"total_hits"`
MaxScore float64 `json:"max_score"`
Took time.Duration `json:"took"`
Facets interface{} `json:"facets"`
}
func (i *IndexElement) SetContentType(v string) {}
func New(cfg config.Fulltext) (fts *Search, err error) {
return &Search{}, nil
}
func (f *Search) Close() error {
return nil
}
func (f *Search) Index(e IndexElement) error {
return nil
}
func (f *Search) BatchIndex(elements []IndexElement) error {
return nil
}
func (f *Search) Delete(eventID string) error {
return nil
}
func (f *Search) Search(term string, roomIDs, keys []string, limit, from int, orderByStreamPos bool) (SearchResult, error) {
return SearchResult{}, nil
}