From 280a61c8dc4fd39bb2b14b8cd2a0641863397cb1 Mon Sep 17 00:00:00 2001 From: anilb Date: Mon, 9 Oct 2023 17:42:33 +0200 Subject: [PATCH 1/7] better pagination in org merge suggestions, convenience script, few edge case fixes --- .../bin/scripts/generate-merge-suggestions.ts | 61 ++++++++++++++ .../repositories/organizationRepository.ts | 81 ++++++++++++++----- backend/src/services/organizationService.ts | 13 +++ 3 files changed, 135 insertions(+), 20 deletions(-) create mode 100644 backend/src/bin/scripts/generate-merge-suggestions.ts diff --git a/backend/src/bin/scripts/generate-merge-suggestions.ts b/backend/src/bin/scripts/generate-merge-suggestions.ts new file mode 100644 index 0000000000..4fa2854772 --- /dev/null +++ b/backend/src/bin/scripts/generate-merge-suggestions.ts @@ -0,0 +1,61 @@ +import commandLineArgs from 'command-line-args' +import commandLineUsage from 'command-line-usage' +import * as fs from 'fs' +import path from 'path' +import { sendNodeWorkerMessage } from '../../serverless/utils/nodeWorkerSQS' +import { NodeWorkerMessageType } from '../../serverless/types/workerTypes' +import { NodeWorkerMessageBase } from '@/types/mq/nodeWorkerMessageBase' + +/* eslint-disable no-console */ + +const banner = fs.readFileSync(path.join(__dirname, 'banner.txt'), 'utf8') + +const options = [ + { + name: 'tenant', + alias: 't', + type: String, + description: 'The unique ID of that tenant that you would like to generate merge suggestions for.', + }, + { + name: 'help', + alias: 'h', + type: Boolean, + description: 'Print this usage guide.', + }, +] +const sections = [ + { + content: banner, + raw: true, + }, + { + header: 'Generate merge suggestions for a tenant', + content: 'Generate merge suggestions for a tenant', + }, + { + header: 'Options', + optionList: options, + }, +] + +const usage = commandLineUsage(sections) +const parameters = commandLineArgs(options) + +if (parameters.help || !parameters.tenant) { + console.log(usage) +} else { + setImmediate(async () => { + const tenantIds = parameters.tenant.split(',') + + for (const tenantId of tenantIds) { + await sendNodeWorkerMessage(tenantId, { + type: NodeWorkerMessageType.NODE_MICROSERVICE, + tenant: tenantId, + service: 'merge-suggestions', + } as NodeWorkerMessageBase) + } + + process.exit(0) + }) +} diff --git a/backend/src/database/repositories/organizationRepository.ts b/backend/src/database/repositories/organizationRepository.ts index 02b4000820..d383dbbf84 100644 --- a/backend/src/database/repositories/organizationRepository.ts +++ b/backend/src/database/repositories/organizationRepository.ts @@ -1047,6 +1047,18 @@ class OrganizationRepository { let yieldChunk: IOrganizationMergeSuggestion[] = [] + const prefixLength = (string: string) => { + if (string.length > 5 && string.length < 8) { + return 6 + } + + if (string.length > 8 && string.length < 12) { + return 9 + } + + return 10 + } + const normalizeScore = (max: number, min: number, score: number): number => { if (score > 100) { return 1 @@ -1064,19 +1076,9 @@ class OrganizationRepository { const queryBody = { from: 0, size: BATCH_SIZE, - query: { - bool: { - must: [ - { - term: { - uuid_tenantId: tenant.id, - }, - }, - ], - }, - }, + query: {}, sort: { - [`date_createdAt`]: 'desc', + [`uuid_organizationId`]: 'asc', }, collapse: { field: 'uuid_organizationId', @@ -1084,12 +1086,43 @@ class OrganizationRepository { _source: ['uuid_organizationId', 'nested_identities', 'uuid_arr_noMergeIds'], } - let organizations: IOrganizationPartialAggregatesOpensearch[] - let offset: number + let organizations: IOrganizationPartialAggregatesOpensearch[] = [] + let lastUuid: string do { - offset = organizations ? offset + BATCH_SIZE : 0 - queryBody.from = offset + if (organizations.length > 0) { + queryBody.query = { + bool: { + filter: [ + { + term: { + uuid_tenantId: tenant.id, + }, + }, + { + range: { + uuid_organizationId: { + gt: lastUuid + } + } + } + ], + }, + } + } + else { + queryBody.query = { + bool: { + filter: [ + { + term: { + uuid_tenantId: tenant.id, + }, + }, + ], + }, + } + } organizations = ( @@ -1099,6 +1132,10 @@ class OrganizationRepository { }) ).body?.hits?.hits || [] + if (organizations.length > 0) { + lastUuid = organizations[organizations.length - 1]._source.uuid_organizationId + } + for (const organization of organizations) { if ( organization._source.nested_identities && @@ -1161,11 +1198,15 @@ class OrganizationRepository { }, }) + // some identities have https? in the beginning, resulting in false positive suggestions + // remove these when making fuzzy, wildcard and prefix searches + const cleanedIdentityName = identity.string_name.replace(/^https?:\/\//, '') + // fuzzy search for identities identitiesPartialQuery.should[1].nested.query.bool.should.push({ match: { [`nested_identities.string_name`]: { - query: identity.string_name, + query: cleanedIdentityName, prefix_length: 1, fuzziness: 'auto', }, @@ -1176,17 +1217,17 @@ class OrganizationRepository { identitiesPartialQuery.should[1].nested.query.bool.should.push({ wildcard: { [`nested_identities.string_name`]: { - value: `${identity.string_name}*`, + value: `${cleanedIdentityName}*`, }, }, }) - // also check for prefix of 5 if identity is longer then 5 characters + // also check for prefix for identities that has more than 5 characters if (identity.string_name.length > 5) { identitiesPartialQuery.should[1].nested.query.bool.should.push({ prefix: { [`nested_identities.string_name`]: { - value: identity.string_name.slice(0, 5), + value: cleanedIdentityName.slice(0, prefixLength(cleanedIdentityName)), }, }, }) diff --git a/backend/src/services/organizationService.ts b/backend/src/services/organizationService.ts index ae3c5a7bdb..6993c52e55 100644 --- a/backend/src/services/organizationService.ts +++ b/backend/src/services/organizationService.ts @@ -243,11 +243,13 @@ export default class OrganizationService extends LoggerBase { } async generateMergeSuggestions(type: OrganizationMergeSuggestionType): Promise { + this.options.log.trace(`Generating merge suggestions for: ${this.options.currentTenant.id}`) const transaction = await SequelizeRepository.createTransaction(this.options) try { if (type === OrganizationMergeSuggestionType.BY_IDENTITY) { let mergeSuggestions + let hasSuggestions = false const generator = OrganizationRepository.getMergeSuggestions({ ...this.options, @@ -256,6 +258,17 @@ export default class OrganizationService extends LoggerBase { do { mergeSuggestions = await generator.next() + if (mergeSuggestions.value) { + this.options.log.trace( + `Tenant: ${this.options.currentTenant.id}, adding ${mergeSuggestions.value.length} organizations to suggestions!`, + ) + hasSuggestions = true + } else if (!hasSuggestions) { + this.options.log.trace(`Tenant doesn't have any merge suggestions`) + } else { + this.options.log.trace(`Finished going tru all suggestions!`) + } + await OrganizationRepository.addToMerge(mergeSuggestions.value, this.options) } while (!mergeSuggestions.done) } From 71c7cf699d6caf4a09869e4aaa188055f987ac47 Mon Sep 17 00:00:00 2001 From: anilb Date: Mon, 9 Oct 2023 17:49:52 +0200 Subject: [PATCH 2/7] formatting --- .../bin/scripts/generate-merge-suggestions.ts | 3 ++- .../repositories/organizationRepository.ts | 17 ++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/backend/src/bin/scripts/generate-merge-suggestions.ts b/backend/src/bin/scripts/generate-merge-suggestions.ts index 4fa2854772..911f5fc184 100644 --- a/backend/src/bin/scripts/generate-merge-suggestions.ts +++ b/backend/src/bin/scripts/generate-merge-suggestions.ts @@ -15,7 +15,8 @@ const options = [ name: 'tenant', alias: 't', type: String, - description: 'The unique ID of that tenant that you would like to generate merge suggestions for.', + description: + 'The unique ID of that tenant that you would like to generate merge suggestions for.', }, { name: 'help', diff --git a/backend/src/database/repositories/organizationRepository.ts b/backend/src/database/repositories/organizationRepository.ts index d383dbbf84..a8b5b1953a 100644 --- a/backend/src/database/repositories/organizationRepository.ts +++ b/backend/src/database/repositories/organizationRepository.ts @@ -1102,15 +1102,14 @@ class OrganizationRepository { { range: { uuid_organizationId: { - gt: lastUuid - } - } - } + gt: lastUuid, + }, + }, + }, ], }, } - } - else { + } else { queryBody.query = { bool: { filter: [ @@ -1132,8 +1131,8 @@ class OrganizationRepository { }) ).body?.hits?.hits || [] - if (organizations.length > 0) { - lastUuid = organizations[organizations.length - 1]._source.uuid_organizationId + if (organizations.length > 0) { + lastUuid = organizations[organizations.length - 1]._source.uuid_organizationId } for (const organization of organizations) { @@ -1200,7 +1199,7 @@ class OrganizationRepository { // some identities have https? in the beginning, resulting in false positive suggestions // remove these when making fuzzy, wildcard and prefix searches - const cleanedIdentityName = identity.string_name.replace(/^https?:\/\//, '') + const cleanedIdentityName = identity.string_name.replace(/^https?:\/\//, '') // fuzzy search for identities identitiesPartialQuery.should[1].nested.query.bool.should.push({ From 4349b0267d0afaa1e41dc2e52c95201191155939 Mon Sep 17 00:00:00 2001 From: anilb Date: Tue, 10 Oct 2023 08:23:05 +0200 Subject: [PATCH 3/7] log calls updated --- backend/src/services/organizationService.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/src/services/organizationService.ts b/backend/src/services/organizationService.ts index 6993c52e55..e2199559c2 100644 --- a/backend/src/services/organizationService.ts +++ b/backend/src/services/organizationService.ts @@ -243,7 +243,7 @@ export default class OrganizationService extends LoggerBase { } async generateMergeSuggestions(type: OrganizationMergeSuggestionType): Promise { - this.options.log.trace(`Generating merge suggestions for: ${this.options.currentTenant.id}`) + this.log.trace(`Generating merge suggestions for: ${this.options.currentTenant.id}`) const transaction = await SequelizeRepository.createTransaction(this.options) try { @@ -259,14 +259,14 @@ export default class OrganizationService extends LoggerBase { mergeSuggestions = await generator.next() if (mergeSuggestions.value) { - this.options.log.trace( + this.log.trace( `Tenant: ${this.options.currentTenant.id}, adding ${mergeSuggestions.value.length} organizations to suggestions!`, ) hasSuggestions = true } else if (!hasSuggestions) { - this.options.log.trace(`Tenant doesn't have any merge suggestions`) + this.log.trace(`Tenant doesn't have any merge suggestions`) } else { - this.options.log.trace(`Finished going tru all suggestions!`) + this.log.trace(`Finished going tru all suggestions!`) } await OrganizationRepository.addToMerge(mergeSuggestions.value, this.options) From 5f940235db25058d458bcce5f54cd12c0baf2bfe Mon Sep 17 00:00:00 2001 From: anilb Date: Tue, 10 Oct 2023 08:31:57 +0200 Subject: [PATCH 4/7] merge suggestion worker now first processes org merge suggestions, convenience script does the actual processing --- backend/src/bin/scripts/generate-merge-suggestions.ts | 11 ++--------- .../merge-suggestions/mergeSuggestionsWorker.ts | 6 +++--- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/backend/src/bin/scripts/generate-merge-suggestions.ts b/backend/src/bin/scripts/generate-merge-suggestions.ts index 911f5fc184..2d7b2f1519 100644 --- a/backend/src/bin/scripts/generate-merge-suggestions.ts +++ b/backend/src/bin/scripts/generate-merge-suggestions.ts @@ -2,9 +2,7 @@ import commandLineArgs from 'command-line-args' import commandLineUsage from 'command-line-usage' import * as fs from 'fs' import path from 'path' -import { sendNodeWorkerMessage } from '../../serverless/utils/nodeWorkerSQS' -import { NodeWorkerMessageType } from '../../serverless/types/workerTypes' -import { NodeWorkerMessageBase } from '@/types/mq/nodeWorkerMessageBase' +import { mergeSuggestionsWorker } from '@/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker' /* eslint-disable no-console */ @@ -50,13 +48,8 @@ if (parameters.help || !parameters.tenant) { const tenantIds = parameters.tenant.split(',') for (const tenantId of tenantIds) { - await sendNodeWorkerMessage(tenantId, { - type: NodeWorkerMessageType.NODE_MICROSERVICE, - tenant: tenantId, - service: 'merge-suggestions', - } as NodeWorkerMessageBase) + await mergeSuggestionsWorker(tenantId) } - process.exit(0) }) } diff --git a/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts b/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts index b0285e5d14..a6cdeb9d48 100644 --- a/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts +++ b/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts @@ -18,6 +18,9 @@ async function mergeSuggestionsWorker(tenantId): Promise { userContext.currentSegments = segments userContext.opensearch = getOpensearchClient(OPENSEARCH_CONFIG) + const organizationService = new OrganizationService(userContext) + await organizationService.generateMergeSuggestions(OrganizationMergeSuggestionType.BY_IDENTITY) + const memberService = new MemberService(userContext) // Splitting these because in the near future we will be treating them differently const byUsername: IMemberMergeSuggestion[] = await memberService.getMergeSuggestions( @@ -32,9 +35,6 @@ async function mergeSuggestionsWorker(tenantId): Promise { IMemberMergeSuggestionsType.SIMILARITY, ) await memberService.addToMerge(bySimilarity) - - const organizationService = new OrganizationService(userContext) - await organizationService.generateMergeSuggestions(OrganizationMergeSuggestionType.BY_IDENTITY) } export { mergeSuggestionsWorker } From a005265e292950cbe56eab59f6e2a64e0444135b Mon Sep 17 00:00:00 2001 From: anilb Date: Tue, 10 Oct 2023 09:12:48 +0200 Subject: [PATCH 5/7] discard empty identities when generating org merge suggestions, logging in merge suggestion worker --- .../repositories/organizationRepository.ts | 76 ++++++++++--------- .../mergeSuggestionsWorker.ts | 10 +++ 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/backend/src/database/repositories/organizationRepository.ts b/backend/src/database/repositories/organizationRepository.ts index a8b5b1953a..42e6ee190a 100644 --- a/backend/src/database/repositories/organizationRepository.ts +++ b/backend/src/database/repositories/organizationRepository.ts @@ -1185,51 +1185,55 @@ class OrganizationRepository { } for (const identity of organization._source.nested_identities) { - // weak identity search - identitiesPartialQuery.should[0].nested.query.bool.should.push({ - bool: { - must: [ - { match: { [`nested_weakIdentities.string_name`]: identity.string_name } }, - { - match: { [`nested_weakIdentities.string_platform`]: identity.string_platform }, - }, - ], - }, - }) - - // some identities have https? in the beginning, resulting in false positive suggestions - // remove these when making fuzzy, wildcard and prefix searches - const cleanedIdentityName = identity.string_name.replace(/^https?:\/\//, '') - - // fuzzy search for identities - identitiesPartialQuery.should[1].nested.query.bool.should.push({ - match: { - [`nested_identities.string_name`]: { - query: cleanedIdentityName, - prefix_length: 1, - fuzziness: 'auto', + if (identity.string_name.length > 0) { + // weak identity search + identitiesPartialQuery.should[0].nested.query.bool.should.push({ + bool: { + must: [ + { match: { [`nested_weakIdentities.string_name`]: identity.string_name } }, + { + match: { + [`nested_weakIdentities.string_platform`]: identity.string_platform, + }, + }, + ], }, - }, - }) + }) - // wildcard search for identities - identitiesPartialQuery.should[1].nested.query.bool.should.push({ - wildcard: { - [`nested_identities.string_name`]: { - value: `${cleanedIdentityName}*`, + // some identities have https? in the beginning, resulting in false positive suggestions + // remove these when making fuzzy, wildcard and prefix searches + const cleanedIdentityName = identity.string_name.replace(/^https?:\/\//, '') + + // fuzzy search for identities + identitiesPartialQuery.should[1].nested.query.bool.should.push({ + match: { + [`nested_identities.string_name`]: { + query: cleanedIdentityName, + prefix_length: 1, + fuzziness: 'auto', + }, }, - }, - }) + }) - // also check for prefix for identities that has more than 5 characters - if (identity.string_name.length > 5) { + // wildcard search for identities identitiesPartialQuery.should[1].nested.query.bool.should.push({ - prefix: { + wildcard: { [`nested_identities.string_name`]: { - value: cleanedIdentityName.slice(0, prefixLength(cleanedIdentityName)), + value: `${cleanedIdentityName}*`, }, }, }) + + // also check for prefix for identities that has more than 5 characters + if (identity.string_name.length > 5) { + identitiesPartialQuery.should[1].nested.query.bool.should.push({ + prefix: { + [`nested_identities.string_name`]: { + value: cleanedIdentityName.slice(0, prefixLength(cleanedIdentityName)), + }, + }, + }) + } } } diff --git a/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts b/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts index a6cdeb9d48..4466190f3f 100644 --- a/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts +++ b/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts @@ -10,6 +10,9 @@ import { import SegmentService from '../../../../services/segmentService' import OrganizationService from '@/services/organizationService' import { OPENSEARCH_CONFIG } from '@/conf' +import { getServiceChildLogger } from '@crowd/logging' + +const log = getServiceChildLogger('mergeSuggestionsWorker') async function mergeSuggestionsWorker(tenantId): Promise { const userContext: IRepositoryOptions = await getUserContext(tenantId) @@ -18,9 +21,15 @@ async function mergeSuggestionsWorker(tenantId): Promise { userContext.currentSegments = segments userContext.opensearch = getOpensearchClient(OPENSEARCH_CONFIG) + log.info(`Generating organization merge suggestions for tenant ${tenantId}!`) + const organizationService = new OrganizationService(userContext) await organizationService.generateMergeSuggestions(OrganizationMergeSuggestionType.BY_IDENTITY) + log.info(`Done generating organization merge suggestions for tenant ${tenantId}!`) + + log.info(`Generating member merge suggestions for tenant ${tenantId}!`) + const memberService = new MemberService(userContext) // Splitting these because in the near future we will be treating them differently const byUsername: IMemberMergeSuggestion[] = await memberService.getMergeSuggestions( @@ -35,6 +44,7 @@ async function mergeSuggestionsWorker(tenantId): Promise { IMemberMergeSuggestionsType.SIMILARITY, ) await memberService.addToMerge(bySimilarity) + log.info(`Done generating member merge suggestions for tenant ${tenantId}!`) } export { mergeSuggestionsWorker } From bce35e3f69bfb424c1912ad8c8f3f4afae9d626b Mon Sep 17 00:00:00 2001 From: anilb Date: Tue, 10 Oct 2023 09:16:59 +0200 Subject: [PATCH 6/7] linting --- .../nodejs/merge-suggestions/mergeSuggestionsWorker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts b/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts index 4466190f3f..295b2ef6c3 100644 --- a/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts +++ b/backend/src/serverless/microservices/nodejs/merge-suggestions/mergeSuggestionsWorker.ts @@ -1,4 +1,5 @@ import { getOpensearchClient } from '@crowd/opensearch' +import { getServiceChildLogger } from '@crowd/logging' import { OrganizationMergeSuggestionType } from '@crowd/types' import getUserContext from '../../../../database/utils/getUserContext' import MemberService from '../../../../services/memberService' @@ -10,7 +11,6 @@ import { import SegmentService from '../../../../services/segmentService' import OrganizationService from '@/services/organizationService' import { OPENSEARCH_CONFIG } from '@/conf' -import { getServiceChildLogger } from '@crowd/logging' const log = getServiceChildLogger('mergeSuggestionsWorker') From 31112c671b06039603562874e7144e62d79abae6 Mon Sep 17 00:00:00 2001 From: anilb Date: Tue, 10 Oct 2023 09:19:39 +0200 Subject: [PATCH 7/7] sorting organization merge suggestions using similarity desc --- backend/src/database/repositories/organizationRepository.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/database/repositories/organizationRepository.ts b/backend/src/database/repositories/organizationRepository.ts index 42e6ee190a..1aca5ea1cd 100644 --- a/backend/src/database/repositories/organizationRepository.ts +++ b/backend/src/database/repositories/organizationRepository.ts @@ -1329,7 +1329,7 @@ class OrganizationRepository { AND os."segmentId" IN (:segmentIds) ) AS "organizationsToMerge" ORDER BY - "organizationsToMerge"."createdAt" DESC + "organizationsToMerge"."similarity" DESC LIMIT :limit OFFSET :offset `, {