diff --git a/src/moin/app.py b/src/moin/app.py index 3590dea92..7270bf0ef 100644 --- a/src/moin/app.py +++ b/src/moin/app.py @@ -169,16 +169,8 @@ class ItemNameConverter(PathConverter): clock.stop("create_app flask-cache") # init storage clock.start("create_app init backends") - try: - init_backends(app) - except EmptyIndexError: - # create-instance has no index at start and index-* subcommands check the index individually - if info_name not in ["create-instance", "build-instance"] and not info_name.startswith("index-"): - clock.stop("create_app init backends") - clock.stop("create_app total") - logging.error("Error: Wiki index not found. Try 'moin help' or 'moin --help' to get further information.") - raise SystemExit(1) - logging.debug("Wiki index not found.") + # start init_backends + _init_backends(app, info_name, clock) clock.stop("create_app init backends") clock.start("create_app flask-babel") i18n_init(app) @@ -212,6 +204,32 @@ def destroy_app(app): deinit_backends(app) +def _init_backends(app, info_name, clock): + """ + initialize the backends with exception handling + """ + try: + init_backends(app) + except EmptyIndexError: + # create-instance has no index at start and index-* subcommands check the index individually + if info_name not in ["create-instance", "build-instance"] and not info_name.startswith("index-"): + missing_indexes = app.storage.missing_index_check() + if missing_indexes == "all": + logging.error( + "Error: all wiki indexes missing. Try 'moin help' or 'moin --help' to get further information." + ) + elif missing_indexes == "'latest_meta'": # TODO: remove this check after 6-12 months + logging.error( + "Error: Wiki index 'latest_meta' missing. 
Please see https://github.com/moinwiki/moin/pull/1877" + ) + else: + logging.error(f"Error: Wiki index {missing_indexes} missing, please check.") + clock.stop("create_app init backends") + clock.stop("create_app total") + raise SystemExit(1) + logging.debug("Wiki index not found.") + + def init_backends(app, create_backend=False): """ initialize the backends diff --git a/src/moin/apps/frontend/views.py b/src/moin/apps/frontend/views.py index a40d10101..5ac80827f 100644 --- a/src/moin/apps/frontend/views.py +++ b/src/moin/apps/frontend/views.py @@ -585,6 +585,8 @@ def wrapper(item_name, rev): abort(404, item_name) if add_trail: flaskg.user.add_trail(item_name, aliases=item.meta.revision.fqnames) + """if view has been called with default rev=CURRENT we can avoid an index query in flash_if_item_deleted""" + item.is_current = rev == CURRENT return wrapped(item) return wrapper @@ -602,6 +604,7 @@ def flash_if_item_deleted(item_name, rev_id, itemrev): Show flash info message if target item is deleted, show another message if revision is deleted. Return True if item is deleted or this revision is deleted. """ + rev_id = CURRENT if getattr(itemrev, "is_current", False) else rev_id if not rev_id == CURRENT: ret = False current_item = Item.create(item_name, rev_id=CURRENT) @@ -1503,13 +1506,13 @@ def name_initial(files, uppercase=False, lowercase=False): # request.args is a MultiDict instance, which degenerates into a normal # single-valued dict on most occasions (making the first value the *only* # value for a specific key) unless explicitly told to expose multiple - # values, eg. calling items with multi=True. See Werkzeug documentation for + # values, e.g. calling items with multi=True. See Werkzeug documentation for # more. 
form = IndexForm.from_flat(request.args.items(multi=True)) selected_groups = form["contenttype"].value startswith = request.values.get("startswith") - dirs, files = item.get_index(startswith, selected_groups) + dirs, files = item.get_index(startswith, selected_groups, short=True) dirs_fullname = [x.fullname for x in dirs] initials = request.values.get("initials") if initials: diff --git a/src/moin/cli/_tests/__init__.py b/src/moin/cli/_tests/__init__.py index 4d4ca8c35..ce3647828 100644 --- a/src/moin/cli/_tests/__init__.py +++ b/src/moin/cli/_tests/__init__.py @@ -16,6 +16,7 @@ from moin._tests import get_dirs from moin import log +from moin.constants.keys import ALL_REVS, LATEST_META logging = log.getLogger(__name__) @@ -131,7 +132,9 @@ def read_index_dump(out: str, latest=False): if item: yield item item = {} - if latest and "all_revs" in line: + if latest and ALL_REVS in line: + break + if LATEST_META in line: break continue space_index = line.index(" ") diff --git a/src/moin/cli/maint/index.py b/src/moin/cli/maint/index.py index e3a9ac88c..0ce8cabf5 100644 --- a/src/moin/cli/maint/index.py +++ b/src/moin/cli/maint/index.py @@ -13,7 +13,7 @@ from flask.cli import FlaskGroup from moin.app import create_app, init_backends -from moin.constants.keys import LATEST_REVS, ALL_REVS +from moin.constants.keys import LATEST_REVS, ALL_REVS, LATEST_META from moin.utils.filesys import wiki_index_exists @@ -134,7 +134,7 @@ def IndexDump(tmp, truncate): logging.error(ERR_NO_INDEX) raise SystemExit(1) logging.info("Index dump started") - for idx_name in [LATEST_REVS, ALL_REVS]: + for idx_name in [LATEST_REVS, ALL_REVS, LATEST_META]: print(f" {'-' * 10} {idx_name} {'-' * 60}") for kvs in app.storage.dump(tmp=tmp, idx_name=idx_name): for k, v in kvs: diff --git a/src/moin/constants/keys.py b/src/moin/constants/keys.py index dcad2eb97..b54414e03 100644 --- a/src/moin/constants/keys.py +++ b/src/moin/constants/keys.py @@ -170,6 +170,7 @@ # index names LATEST_REVS = "latest_revs" 
ALL_REVS = "all_revs" +LATEST_META = "latest_meta" # values for ACTION key ACTION_SAVE = "SAVE" diff --git a/src/moin/items/__init__.py b/src/moin/items/__init__.py index cc9a8faec..c62e49832 100644 --- a/src/moin/items/__init__.py +++ b/src/moin/items/__init__.py @@ -83,6 +83,7 @@ TAGS, TEMPLATE, LATEST_REVS, + LATEST_META, EDIT_ROWS, FQNAMES, USERGROUP, @@ -1186,7 +1187,6 @@ def handle_variables(self, data, meta): @rtype: string @return: new text of wikipage, variables replaced """ - logging.debug(f"handle_variable data: {data!r}") if self.contenttype not in CONTENTTYPE_VARIABLES: return data if "@" not in data: @@ -1196,6 +1196,7 @@ def handle_variables(self, data, meta): if TEMPLATE in meta["tags"]: return data + logging.debug(f"handle_variable data: {data!r}") # log only if necessary item_name = request.path.split("/", 2)[-1] signature = flaskg.user.name0 if flaskg.user.valid else request.remote_addr @@ -1356,7 +1357,7 @@ def build_index_query(self, startswith=None, selected_groups=None, isglobalindex return query - def get_index(self, startswith=None, selected_groups=None, regex=None): + def get_index(self, startswith=None, selected_groups=None, regex=None, short=False): """ Get index enties for descendents of the matching items @@ -1371,11 +1372,12 @@ def get_index(self, startswith=None, selected_groups=None, regex=None): - one for "dirs" (direct descendents that also contain descendents) """ fqname = self.fqname + idx_name = LATEST_META if short else LATEST_REVS isglobalindex = not fqname.value or fqname.value == NAMESPACE_ALL query = self.build_index_query(startswith, selected_groups, isglobalindex) if not fqname.value.startswith(NAMESPACE_ALL + "/") and fqname.value != NAMESPACE_ALL: query = Term(NAMESPACE, fqname.namespace) & query - revs = flaskg.storage.search_meta(query, idx_name=LATEST_REVS, sortedby=NAME_EXACT, limit=None, regex=regex) + revs = flaskg.storage.search_meta(query, idx_name=idx_name, sortedby=NAME_EXACT, limit=None, regex=regex) return 
self.make_flat_index(revs, isglobalindex) diff --git a/src/moin/macros/ItemList.py b/src/moin/macros/ItemList.py index c02daaf0f..62f1be968 100644 --- a/src/moin/macros/ItemList.py +++ b/src/moin/macros/ItemList.py @@ -131,7 +131,7 @@ def macro(self, content, arguments, page_url, alternative): item = "" # verify item exists and current user has read permission elif item != "": - if not flaskg.storage.get_item(**(split_fqname(item).query)): + if not flaskg.storage.get_item(short=True, **(split_fqname(item).query)): err_msg = _("Item does not exist or read access blocked by ACLs: {0}").format(item) return fail_message(err_msg, alternative) diff --git a/src/moin/macros/_base.py b/src/moin/macros/_base.py index 5bee33157..7f79c84a9 100644 --- a/src/moin/macros/_base.py +++ b/src/moin/macros/_base.py @@ -49,7 +49,7 @@ def get_item_names(name="", startswith="", kind="files", skiptag="", tag="", reg item = Item.create(name) except AccessDenied: abort(403) - dirs, files = item.get_index(startswith, regex=regex) + dirs, files = item.get_index(startswith, regex=regex, short=True) item_names = [] if not kind or kind == "files" or kind == "both": for item in files: diff --git a/src/moin/storage/middleware/indexing.py b/src/moin/storage/middleware/indexing.py index b52712203..90e20175b 100644 --- a/src/moin/storage/middleware/indexing.py +++ b/src/moin/storage/middleware/indexing.py @@ -91,7 +91,7 @@ WHOOSH_FILESTORAGE = "FileStorage" -INDEXES = [LATEST_REVS, ALL_REVS] +INDEXES = [LATEST_REVS, ALL_REVS, LATEST_META] VALIDATION_HANDLING_STRICT = "strict" VALIDATION_HANDLING_WARN = "warn" @@ -148,7 +148,7 @@ def search_names(name_prefix, limit=None): :return: item names list """ - idx_name = LATEST_REVS + idx_name = LATEST_META q = Prefix(NAME_EXACT, name_prefix) with flaskg.storage.indexer.ix[idx_name].searcher() as searcher: results = searcher.search(q, limit=limit) @@ -173,7 +173,8 @@ def backend_to_index(meta, content, schema, backend_name): # we have UNIX UTC timestamp 
(int), whoosh wants datetime doc[key] = utcfromtimestamp(doc[key]) doc[NAME_EXACT] = doc[NAME] - doc[CONTENT] = content + if CONTENT in schema: + doc[CONTENT] = content doc[BACKENDNAME] = backend_name if CONTENTNGRAM in schema: doc[CONTENTNGRAM] = content @@ -190,9 +191,11 @@ def backend_to_index(meta, content, schema, backend_name): doc[NAMES] = " | ".join(fullnames) else: doc[NAMES] = " | ".join(doc[NAME]) - doc[NAME_SORT] = doc[NAMES].replace("/", "") + doc_name_sort = doc[NAMES].replace("/", "") else: - doc[NAME_SORT] = "" + doc_name_sort = "" + if NAME_SORT in schema: + doc[NAME_SORT] = doc_name_sort return doc @@ -443,12 +446,37 @@ def __init__(self, index_storage, backend, acl_rights_contents=[], **kw): all_revs_fields = {ITEMID: ID(stored=True)} all_revs_fields.update(**common_fields) + # Small index for the latest revisions, used for queries such as has_item, authorization checks and + # the +index route. This index has no content or *NGRAMS, which improves query speed for large wikis + latest_meta_fields = { + # ITEMID from metadata - as there is only latest rev of same item here, it is unique + ITEMID: ID(unique=True, stored=True), + NAMESPACE: ID(stored=True), + NAME: TEXT(stored=True), + NAMES: TEXT(stored=True, multitoken_query="or", analyzer=item_name_analyzer(), field_boost=30.0), + NAME_EXACT: ID(field_boost=1.0), + REVID: ID(unique=True, stored=True), + REV_NUMBER: NUMERIC(stored=True), + PARENTID: ID(stored=True), + BACKENDNAME: ID(stored=True), + MTIME: DATETIME(stored=True), + ITEMTYPE: ID(stored=True), + CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()), + USERID: ID(stored=True), + ADDRESS: ID(stored=True), + HOSTNAME: ID(stored=True), + SIZE: NUMERIC(stored=True), + ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True), + } + latest_revisions_schema = Schema(**latest_revs_fields) all_revisions_schema = Schema(**all_revs_fields) + latest_index_schema = 
Schema(**latest_meta_fields) # schemas are needed by query parser and for index creation self.schemas[ALL_REVS] = all_revisions_schema self.schemas[LATEST_REVS] = latest_revisions_schema + self.schemas[LATEST_META] = latest_index_schema # Define dynamic fields dynamic_fields = [ @@ -511,6 +539,15 @@ def open(self): for name in INDEXES: self.ix[name] = storage.open_index(name) + def missing_index_check(self): + """ + check existence of all indexes. + return: "all" or string with list of missing indexes + """ + storage = self.get_storage() + missing_indexes = [name for name in INDEXES if not storage.index_exists(name)] + return "all" if len(missing_indexes) == len(INDEXES) else str(missing_indexes)[1:-1] + def close(self): """ Close all indexes. @@ -581,31 +618,23 @@ def index_revision(self, meta, content, backend_name, async_=True, force_latest= == doc[REVID] ) if is_latest: - doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], backend_name) - if async_: - writer = AsyncWriter(self.ix[LATEST_REVS]) - else: - writer = self.ix[LATEST_REVS].writer() - with writer as writer: - writer.update_document(**doc) + for idx_name in [LATEST_REVS, LATEST_META]: + doc = backend_to_index(meta, content, self.schemas[idx_name], backend_name) + if async_: + writer = AsyncWriter(self.ix[idx_name]) + else: + writer = self.ix[idx_name].writer() + with writer as writer: + writer.update_document(**doc) - def remove_revision(self, revid, async_=True): - """ - Remove a single revision from indexes. 
- """ + def remove_index_revision(self, revid, async_=True, idx_name=LATEST_REVS): if async_: - writer = AsyncWriter(self.ix[ALL_REVS]) + writer = AsyncWriter(self.ix[idx_name]) else: - writer = self.ix[ALL_REVS].writer() - with writer as writer: - writer.delete_by_term(REVID, revid) - if async_: - writer = AsyncWriter(self.ix[LATEST_REVS]) - else: - writer = self.ix[LATEST_REVS].writer() + writer = self.ix[idx_name].writer() with writer as writer: # find out itemid related to the revid we want to remove: - with self.ix[LATEST_REVS].searcher() as searcher: + with self.ix[idx_name].searcher() as searcher: docnum_remove = searcher.document_number(revid=revid) if docnum_remove is not None: itemid = searcher.stored_fields(docnum_remove)[ITEMID] @@ -616,7 +645,7 @@ def remove_revision(self, revid, async_=True): # we have a latest revision, just update the document in the index: assert len(latest_backends_revids) == 1 # this item must have only one latest revision latest_backend_revid = latest_backends_revids[0] - # we must fetch from backend because schema for LATEST_REVS is different than for ALL_REVS + # we must fetch from backend because schema for idx_name is different than for ALL_REVS # (and we can't be sure we have all fields stored, too) meta, _ = self.backend.retrieve(*latest_backend_revid) # we only use meta (not data), because we do not want to transform data->content again (this @@ -624,14 +653,25 @@ def remove_revision(self, revid, async_=True): with self.ix[ALL_REVS].searcher() as searcher: doc = searcher.document(revid=latest_backend_revid[1]) content = doc[CONTENT] - doc = backend_to_index( - meta, content, self.schemas[LATEST_REVS], backend_name=latest_backend_revid[0] - ) + doc = backend_to_index(meta, content, self.schemas[idx_name], backend_name=latest_backend_revid[0]) writer.update_document(**doc) else: # this is no revision left in this item that could be the new "latest rev", just kill the rev writer.delete_document(docnum_remove) + def 
remove_revision(self, revid, async_=True): + """ + Remove a single revision from indexes. + """ + if async_: + writer = AsyncWriter(self.ix[ALL_REVS]) + else: + writer = self.ix[ALL_REVS].writer() + with writer as writer: + writer.delete_by_term(REVID, revid) + for idx_name in [LATEST_REVS, LATEST_META]: + self.remove_index_revision(revid, async_=async_, idx_name=idx_name) + def _modify_index(self, index, schema, revids, mode="add", procs=None, limitmb=None, multisegment=False): """ modify index contents - add, update, delete the indexed documents for all given revids @@ -706,20 +746,21 @@ def rebuild(self, tmp=False, procs=None, limitmb=None, multisegment=False): finally: index.close() - # now build the index of the latest revisions: - index = storage.open_index(LATEST_REVS) - try: - self._modify_index( - index, - self.schemas[LATEST_REVS], - latest_backends_revids, - "add", - procs=procs, - limitmb=limitmb, - multisegment=multisegment, - ) - finally: - index.close() + # now build the indexes for latest revisions: + for idx_name in [LATEST_REVS, LATEST_META]: + index = storage.open_index(idx_name) + try: + self._modify_index( + index, + self.schemas[idx_name], + latest_backends_revids, + "add", + procs=procs, + limitmb=limitmb, + multisegment=multisegment, + ) + finally: + index.close() def update(self, tmp=False): """ @@ -757,18 +798,21 @@ def update(self, tmp=False): backend_latest_backends_revids = set(self._find_latest_backends_revids(index_all)) finally: index_all.close() - index_latest = storage.open_index(LATEST_REVS) - try: - # now update LATEST_REVS index: - with index_latest.searcher() as searcher: - ix_revids = {doc[REVID] for doc in searcher.all_stored_fields()} - backend_latest_revids = {revid for name, revid in backend_latest_backends_revids} - upd_revids = backend_latest_revids - ix_revids - upd_revids = [(revids_backends[revid], revid) for revid in upd_revids] - self._modify_index(index_latest, self.schemas[LATEST_REVS], upd_revids, "update") - 
self._modify_index(index_latest, self.schemas[LATEST_REVS], del_revids, "delete") - finally: - index_latest.close() + + # update LATEST_REVS and LATEST_META + for idx_name in [LATEST_REVS, LATEST_META]: + index_latest = storage.open_index(idx_name) + try: + with index_latest.searcher() as searcher: + ix_revids = {doc[REVID] for doc in searcher.all_stored_fields()} + backend_latest_revids = {revid for name, revid in backend_latest_backends_revids} + upd_revids = backend_latest_revids - ix_revids + upd_revids = [(revids_backends[revid], revid) for revid in upd_revids] + self._modify_index(index_latest, self.schemas[idx_name], upd_revids, "update") + self._modify_index(index_latest, self.schemas[idx_name], del_revids, "delete") + finally: + index_latest.close() + return changed def optimize_backend(self): @@ -944,16 +988,21 @@ def document(self, idx_name=LATEST_REVS, **kw): item = Item(self, latest_doc=latest_doc, itemid=doc[ITEMID]) return item.get_revision(doc[REVID], doc=doc) - def _document(self, idx_name=LATEST_REVS, **kw): + def _document(self, idx_name=LATEST_REVS, short=False, **kw): """ Return a document matching the kw args (internal use only). """ + if short: + idx_name = LATEST_META with self.ix[idx_name].searcher() as searcher: return searcher.document(**kw) def has_item(self, name): - # TODO: Add fqname support to this method - item = self[name] + if name.startswith("@itemid/"): + item = Item(self, short=True, **{ITEMID: name[8:]}) + else: + fqname = split_fqname(name) + item = Item(self, short=True, **{NAME_EXACT: fqname.value, NAMESPACE: fqname.namespace}) return bool(item) def __getitem__(self, name): @@ -965,14 +1014,14 @@ def __getitem__(self, name): fqname = split_fqname(name) return Item(self, **{NAME_EXACT: fqname.value, NAMESPACE: fqname.namespace}) - def get_item(self, **query): + def get_item(self, short=False, **query): """ Return item identified by the query (may be a new or existing item). :kwargs query: e.g. 
name_exact="Foo" or itemid="..." or ... (must be a unique fieldname=value for the latest-revs index) """ - return Item(self, **query) + return Item(self, short=short, **query) def create_item(self, **query): """ @@ -1079,7 +1128,7 @@ def mtime(self): class Item(PropertiesMixin): - def __init__(self, indexer, latest_doc=None, **query): + def __init__(self, indexer, latest_doc=None, short=False, **query): """ :param indexer: indexer middleware instance :param latest_doc: if caller already has a latest-revs index whoosh document @@ -1094,7 +1143,7 @@ def __init__(self, indexer, latest_doc=None, **query): self._name = query.get(NAME_EXACT) if latest_doc is None: # we need to call the method without acl check to avoid endless recursion: - latest_doc = self.indexer._document(**query) + latest_doc = self.indexer._document(short=short, **query) if latest_doc is None: # no such item, create a dummy doc that has a NAME entry to # avoid issues in the name(s) property code. if this was a @@ -1127,7 +1176,7 @@ def parentids(self): """ parent_ids = set() for parent_name in self.parentnames: - rev = self.indexer._document(idx_name=LATEST_REVS, **{NAME_EXACT: parent_name}) + rev = self.indexer._document(idx_name=LATEST_META, **{NAME_EXACT: parent_name}) if rev: parent_ids.add(rev[ITEMID]) return parent_ids diff --git a/src/moin/storage/middleware/protecting.py b/src/moin/storage/middleware/protecting.py index 4a9509402..ea90ba434 100644 --- a/src/moin/storage/middleware/protecting.py +++ b/src/moin/storage/middleware/protecting.py @@ -153,7 +153,7 @@ def _get_acls(self, itemid=None, fqname=None): item = None if not meta_available or self._get_configured_acls(fqname)["hierarchic"]: """self.meta is not valid or namespace uses hierarchic acls and we need item parentids""" - item = self.get_item(**q) + item = self.get_item(short=True, **q) acl = item.acl fqname = item.fqname if acl is not None: @@ -301,8 +301,8 @@ def __getitem__(self, name): item = self.indexer[name] return 
ProtectedItem(self, item) - def get_item(self, **query): - item = self.indexer.get_item(**query) + def get_item(self, short=False, **query): + item = self.indexer.get_item(short=short, **query) return ProtectedItem(self, item) def create_item(self, **query): @@ -326,7 +326,7 @@ def may(self, fqname, capability, usernames=None, item=None): if item: item = ProtectedItem(self, item) else: - item = self.get_item(**fqname.query) + item = self.get_item(short=True, **fqname.query) allowed = item.allows(capability, user_names=usernames) return allowed