implement script to populate projects with users

jng34 · jng34 · commit 1153e29b9699 · 2025-08-20T20:35:29.000-04:00
diff --git a/backend/scripts/.gitignore b/backend/scripts/.gitignore
@@ -0,0 +1,3 @@
+# Created by venv; see https://docs.python.org/3/library/venv.html
+adjustedreqs.txt
+venv/
diff --git a/backend/scripts/python/env/Populate Projects.ipynb b/backend/scripts/python/env/Populate Projects.ipynb
@@ -0,0 +1,317 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b9b6c5e5-4b20-4407-9542-3bea81ab742e",
+   "metadata": {},
+   "source": [
+    "# Setup\n",
+    "For dev, you must have the backend API running on your computer. For prod, please change USER_API_URL to reflect the production URL. The MongoDB connection string is read from `DATABASE_URL` in your `.env` file (or environment)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 393,
+   "id": "d04b046c-ad92-4f9b-a7d1-c900c1ff4581",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "import os\n",
+    "import re\n",
+    "import pprint as pp\n",
+    "from dotenv import load_dotenv\n",
+    "from bson.objectid import ObjectId\n",
+    "from datetime import datetime\n",
+    "from functools import reduce\n",
+    "from pymongo import MongoClient, ReturnDocument, UpdateOne\n",
+    "from pymongo.errors import BulkWriteError\n",
+    "\n",
+    "# Load variables from a .env file into the process environment, then read the settings.\n",
+    "load_dotenv()\n",
+    "custom_request_header = os.getenv(\"CUSTOM_REQUEST_HEADER\")  # not referenced by any later cell in this notebook\n",
+    "DATABASE_URL = os.getenv(\"DATABASE_URL\")\n",
+    "\n",
+    "# Fail fast: os.getenv returns None when the variable is missing, and MongoClient(None)\n",
+    "# falls back to its default host (localhost), so the notebook would silently run\n",
+    "# against the wrong server.\n",
+    "if not DATABASE_URL:\n",
+    "    raise RuntimeError(\"DATABASE_URL is not set; add it to your .env file or environment\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "852bea67-8354-49df-b6fb-c766f305ee8a",
+   "metadata": {},
+   "source": [
+    "# Connect to database and check current list of DBs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 395,
+   "id": "33d48fca-a40d-4619-b97b-46b598258967",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['backup_db', 'testdb', 'vrms-populate-projects-test', 'vrms-slack-dev', 'vrms-slack-main', 'vrms-slack-staging', 'vrms-test', 'vrms-test-2', 'vrms-test-3', 'vrms-test-4', 'vrms-test-5', 'vrms-test-6', 'vrms-test-clone-project-sync', 'vrms-test-copy', 'vrms-test-sync', 'vrms-user-migration-test', 'admin', 'local']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Open a client for the MongoDB deployment named by DATABASE_URL\n",
+    "client = MongoClient(DATABASE_URL)\n",
+    "\n",
+    "# Show which databases this connection can see\n",
+    "database_names = client.list_database_names()\n",
+    "print(database_names)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e4d3414-f130-4e76-9506-efd468d401df",
+   "metadata": {},
+   "source": [
+    "# Create a new test database\n",
+    "\n",
+    "Define the source database and the copy (test) database.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 396,
+   "id": "68a7e8a9-e3f3-4231-8424-8b8dd44f522f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# db_source is only read from; db_copy is the database the later cells write to\n",
+    "db_source = client.get_database('vrms-test')\n",
+    "db_copy = client.get_database('vrms-populate-projects-test')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6565ea84-e799-40d0-a56b-7859620db461",
+   "metadata": {},
+   "source": [
+    "# Drop all collections in test database (ONLY IF NECESSARY!)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 405,
+   "id": "a4cb07f2-3e55-4a2e-8358-96bf67ebf354",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for collection_name in db_copy.list_collection_names():\n",
+    "#     db_copy.drop_collection(collection_name)\n",
+    "#     print(f\"Dropped collection: {collection_name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "141b69ae-a407-4c41-a551-33f547244eb0",
+   "metadata": {},
+   "source": [
+    "# Copy Users and Projects collections from source -> test databases\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 398,
+   "id": "fd46eb06-d246-455e-8f48-a4e5df0efc9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "users_collection = db_source['users']\n",
+    "users = list(users_collection.find())\n",
+    "projects_collection = db_source['projects']\n",
+    "projects = list(projects_collection.find())\n",
+    "\n",
+    "users_copy = db_copy['users']\n",
+    "projects_copy = db_copy['projects']\n",
+    "\n",
+    "\n",
+    "def copy_documents(documents, target_collection, label):\n",
+    "    \"\"\"Insert `documents` into `target_collection` without aborting on write errors.\n",
+    "\n",
+    "    Args:\n",
+    "        documents: list of documents read from the source collection.\n",
+    "        target_collection: pymongo collection to insert into.\n",
+    "        label: short name used in the printed messages.\n",
+    "\n",
+    "    A BulkWriteError (e.g. duplicate _id values when the cell is re-run) is printed\n",
+    "    rather than raised, so one collection failing does not stop the next copy.\n",
+    "    \"\"\"\n",
+    "    if not documents:\n",
+    "        # insert_many raises TypeError when given an empty list\n",
+    "        print(f\"No {label} found in source; nothing to copy\")\n",
+    "        return\n",
+    "    try:\n",
+    "        # ordered=False keeps inserting the remaining documents after a failure\n",
+    "        target_collection.insert_many(documents, ordered=False)\n",
+    "    except BulkWriteError as bwe:\n",
+    "        print(f\"Some {label} could not be copied\")\n",
+    "        print(\"BulkWriteError details:\")\n",
+    "        print(bwe.details)  # This contains info on which documents failed and why\n",
+    "\n",
+    "\n",
+    "# Each collection is copied in its own try block: previously a BulkWriteError while\n",
+    "# copying users skipped the projects copy entirely.\n",
+    "copy_documents(users, users_copy, \"users\")  # Copy source db users to test db users\n",
+    "copy_documents(projects, projects_copy, \"projects\")  # Copy source db projects to test db projects"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c8b8712-7654-4f42-96c2-3809d33d214a",
+   "metadata": {},
+   "source": [
+    "# Get users with at least one managed project\n",
+    "\n",
+    "Retrieve a list of all users whose `managedProjects` field is present and non-empty.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 399,
+   "id": "d4f52891-72c0-440c-8ef1-0f2102cebdb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Condition: the field is present and does not have size 0\n",
+    "has_managed_projects = {\"$exists\": True, \"$not\": {\"$size\": 0}}\n",
+    "query = {\"managedProjects\": has_managed_projects}\n",
+    "\n",
+    "# Materialise the cursor into a list so later cells can iterate over it\n",
+    "target_users = list(users_copy.find(query))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de61c365-ec09-4acf-b863-221067f988db",
+   "metadata": {},
+   "source": [
+    "# Create a dictionary called `projects_users`\n",
+    "\n",
+    "The dict has project IDs as keys and arrays of user IDs as values\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 400,
+   "id": "dd384405-c9bc-4b00-bb9b-8dcd4be0e9ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'68a3e64ee2653c001fe3ff3b': [ObjectId('6481155fab091f001e30925b'),\n",
+      "                              ObjectId('66024c13e6a0050028e07948'),\n",
+      "                              ObjectId('670dd397cace6a002abb20ce')],\n",
+      " '68a3e75ea19d60385b3938f8': [ObjectId('670dd397cace6a002abb20ce')]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Maps project id (as a string) -> list of ids of the users who manage that project\n",
+    "projects_users = {}\n",
+    "\n",
+    "# Function to filter only projects with valid mongoose IDs\n",
+    "def filter_valid_mongoose_ids(id_list):\n",
+    "    \"\"\"Return the entries of `id_list` accepted by `ObjectId.is_valid`, in their original order.\"\"\"\n",
+    "    return [x for x in id_list if ObjectId.is_valid(x)]\n",
+    "\n",
+    "for user in target_users:\n",
+    "    # Destructure id and managed projects from user\n",
+    "    _id, managed_projects = user['_id'], user['managedProjects']\n",
+    "\n",
+    "    # The query only guarantees the field exists and is not an empty array; skip\n",
+    "    # values such as null or a bare string that cannot be treated as a list of ids.\n",
+    "    if not isinstance(managed_projects, (list, tuple)):\n",
+    "        continue\n",
+    "\n",
+    "    for proj_id in filter_valid_mongoose_ids(managed_projects):\n",
+    "        # Look up and store under the same string key. Testing the raw proj_id against\n",
+    "        # string keys never matches when proj_id is an ObjectId, which made every user\n",
+    "        # overwrite the project's list instead of being appended to it.\n",
+    "        key = str(proj_id)\n",
+    "        managers = projects_users.setdefault(key, [])\n",
+    "\n",
+    "        # A user listing the same project twice should still appear only once\n",
+    "        if _id not in managers:\n",
+    "            managers.append(_id)\n",
+    "\n",
+    "pp.pprint(projects_users)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a34d198a-ce32-41af-b4e2-2be590a6f5a6",
+   "metadata": {},
+   "source": [
+    "# Update `managedByUsers` field in Projects \n",
+    "\n",
+    "Update each project's `managedByUsers` array using a bulk write."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 404,
+   "id": "f280d029-47ed-46ef-a8d1-731071600a49",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Project before update:\n",
+      "{'__v': 0,\n",
+      " '_id': ObjectId('68a3e64ee2653c001fe3ff3b'),\n",
+      " 'createdDate': datetime.datetime(2025, 8, 19, 2, 49, 50, 843000),\n",
+      " 'description': 'Testing...',\n",
+      " 'githubIdentifier': 'lkjlkj',\n",
+      " 'githubUrl': 'lkjlk',\n",
+      " 'googleDriveUrl': 'https://drive.google.com/drive/folders/1hAq0wyZKOaZLujqOYiaFv5PYgooISger?usp=drive_link',\n",
+      " 'hflaWebsiteUrl': 'lkjlkj',\n",
+      " 'managedByUsers': [ObjectId('6481155fab091f001e30925b'),\n",
+      "                    ObjectId('66024c13e6a0050028e07948'),\n",
+      "                    ObjectId('670dd397cace6a002abb20ce')],\n",
+      " 'name': 'Jacks Test Project',\n",
+      " 'partners': [],\n",
+      " 'projectStatus': 'Active',\n",
+      " 'recruitingCategories': [],\n",
+      " 'slackUrl': 'lkjlkj'}\n",
+      "Project before update:\n",
+      "{'__v': 0,\n",
+      " '_id': ObjectId('68a3e75ea19d60385b3938f8'),\n",
+      " 'createdDate': datetime.datetime(2025, 8, 19, 2, 54, 22, 871000),\n",
+      " 'description': 'afk',\n",
+      " 'githubIdentifier': 'afk',\n",
+      " 'githubUrl': 'afk',\n",
+      " 'googleDriveUrl': 'https://drive.google.com/test',\n",
+      " 'hflaWebsiteUrl': 'afk',\n",
+      " 'managedByUsers': [ObjectId('670dd397cace6a002abb20ce')],\n",
+      " 'name': 'VRMS Test Project',\n",
+      " 'partners': [],\n",
+      " 'projectStatus': 'Active',\n",
+      " 'recruitingCategories': [],\n",
+      " 'slackUrl': 'afk'}\n",
+      "Result:  BulkWriteResult({'writeErrors': [], 'writeConcernErrors': [], 'nInserted': 0, 'nUpserted': 0, 'nMatched': 2, 'nModified': 0, 'nRemoved': 0, 'upserted': []}, acknowledged=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One UpdateOne per project that exists in the copy database\n",
+    "operations = []\n",
+    "\n",
+    "for proj_id, user_ids in projects_users.items():\n",
+    "    valid_user_ids = [uid for uid in user_ids if ObjectId.is_valid(uid)]\n",
+    "\n",
+    "    # Keys of projects_users are strings; convert once and reuse for lookup and update\n",
+    "    project_oid = ObjectId(proj_id)\n",
+    "\n",
+    "    proj = projects_copy.find_one({\"_id\": project_oid})\n",
+    "\n",
+    "    if proj:\n",
+    "        print('Project before update:')\n",
+    "        pp.pprint(proj)\n",
+    "\n",
+    "        # Compile individual updates in operations\n",
+    "        operations.append(UpdateOne(\n",
+    "            {\"_id\": project_oid}, # Filter\n",
+    "            {\"$set\": {\"managedByUsers\": valid_user_ids}}, # Update\n",
+    "        ))\n",
+    "    else:\n",
+    "        print(f\"No project with {proj_id} found\")\n",
+    "\n",
+    "# Execute the bulk write to update operations.\n",
+    "# bulk_write raises InvalidOperation when given an empty list, so only call it\n",
+    "# when at least one project matched.\n",
+    "if operations:\n",
+    "    result = projects_copy.bulk_write(operations)\n",
+    "    print(\"Result: \", result)\n",
+    "else:\n",
+    "    print(\"No matching projects found; nothing to update\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Created by venv; see https://docs.python.org/3/library/venv.html`
	`2`	`+adjustedreqs.txt`
	`3`	`+venv/`