This commit is contained in:
Charles-Gormley
2024-11-30 23:26:17 -05:00
parent 28e964fa10
commit 9dda7ff55a
10 changed files with 1296 additions and 289398 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

227
monthly_ingestion.ipynb Normal file
View File

@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RSS Article Batch Download Examples\n",
"\n",
"This notebook demonstrates how to batch download RSS articles from S3."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from time import time\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Root imports\n",
"\n",
"from src.search.batch import S3BatchDownloader\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"downloader = S3BatchDownloader()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Verify S3 Access and Download Articles"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"List permission: Allowed\n"
]
}
],
"source": [
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects. Only an access-denied error means the permission is missing;\n",
"# any other ClientError (for example a missing bucket) is reported with its code\n",
"# so a configuration problem is not mistaken for an IAM problem.\n",
"try:\n",
"    response = s3.list_objects_v2(Bucket=os.getenv(\"S3_BUCKET_NAME\"))\n",
"    print(\"List permission: Allowed\")\n",
"except ClientError as e:\n",
"    error = e.response.get(\"Error\", {})\n",
"    if error.get(\"Code\") in (\"AccessDenied\", \"403\"):\n",
"        print(\"List permission: Denied\")\n",
"    else:\n",
"        print(f\"List check failed: {error.get('Code')} - {error.get('Message')}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 59592 objects to process\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 53%|█████▎ | 31707/59592 [04:33<6:08:33, 1.26object/s]IOStream.flush timed out\n",
"100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to November-1.csv in 605.81 seconds\n"
]
}
],
"source": [
"# Destination file and date window for the first November export.\n",
"output_path = \"November-1.csv\" # or \"consolidated_data.json\"\n",
"start_date = \"2024-11-17\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-22\"\n",
"\n",
"# Arguments for the export, gathered in one place.\n",
"export_args = dict(\n",
"    output_path=output_path,\n",
"    file_format=\"csv\", # or \"json\"\n",
"    start_date=start_date,\n",
"    end_date=end_date,\n",
")\n",
"\n",
"# Time the whole download_to_file call.\n",
"start = time()\n",
"downloader.download_to_file(**export_args)\n",
"elapsed = time() - start\n",
"\n",
"print(f\"Downloaded data to {output_path} in {elapsed:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 50037 objects to process\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 66%|██████▌ | 33067/50037 [04:07<11:49, 23.92object/s] IOStream.flush timed out\n",
"100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to November-2.csv in 539.74 seconds\n"
]
}
],
"source": [
"# Destination file and date window for the second November export.\n",
"output_path = \"November-2.csv\" # or \"consolidated_data.json\"\n",
"start_date = \"2024-11-23\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-27\"\n",
"\n",
"# Arguments for the export, gathered in one place.\n",
"export_args = dict(\n",
"    output_path=output_path,\n",
"    file_format=\"csv\", # or \"json\"\n",
"    start_date=start_date,\n",
"    end_date=end_date,\n",
")\n",
"\n",
"# Time the whole download_to_file call.\n",
"start = time()\n",
"downloader.download_to_file(**export_args)\n",
"elapsed = time() - start\n",
"\n",
"print(f\"Downloaded data to {output_path} in {elapsed:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start = time()\n",
"output_path = \"November-3.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Define date range. Dates are written zero-padded (YYYY-MM-DD) like the other\n",
"# export cells so strict ISO date parsing and string comparison both work.\n",
"# NOTE(review): this window (Nov 8 - Nov 30) also spans the dates already exported\n",
"# to November-1.csv and November-2.csv -- confirm the overlap is intended.\n",
"start_date = \"2024-11-08\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-30\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
"    output_path=output_path,\n",
"    file_format=\"csv\", # or \"json\"\n",
"    start_date=start_date,\n",
"    end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Aggregating"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -5,3 +5,4 @@ constructs==10.2.69
# Optional, yet necessary for the Pinecone SDK functionality. # Optional, yet necessary for the Pinecone SDK functionality.
pinecone pinecone
openai openai
tqdm

View File

@@ -93,5 +93,910 @@
"u": "https://www.aspireiq.com/blog/rss.xml", "u": "https://www.aspireiq.com/blog/rss.xml",
"dt": 1699830841, "dt": 1699830841,
"update": 1 "update": 1
},
{
"u": "https://theinfluencermarketingfactory.com/blogs/influencer-marketing-tips.atom",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.forbes.com/personal-brands/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.forbes.com/event-hosting/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://cmxhub.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.thepodcasthost.com/feed/podcast/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://podcastmovementsessions.libsyn.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://podcraft.libsyn.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "http://feeds.feedburner.com/streamingmedia/TV",
"dt": 1699830841,
"update": 1
},
{
"u": "https://livestream.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ecommercetimes.com/perl/syndication/rssfull.pl",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.causevox.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/rss/harvard-business-review",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.fastcompany.com/feed/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.thedrum.com/rss/news.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://crisis-response.com/Articles/index.php/rss-feeds",
"dt": 1699830841,
"update": 1
},
{
"u": "https://prcg.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.odwyerpr.com/rss-feed.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.prweek.com/uk/rss/news",
"dt": 1699830841,
"update": 1
},
{
"u": "http://feeds.feedburner.com/clickboothblog",
"dt": 1699830841,
"update": 1
},
{
"u": "https://feeds.feedburner.com/avantlinkblog",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.slickdeals.net/feeds/affiliate/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://neoreach.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://onalytica.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.braginskyassociates.com/blog-feed.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.sociallyawareblog.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mediapost.com/rss/news/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nikonusa.com/en/learn-and-explore/index.page",
"dt": 1699830841,
"update": 1
},
{
"u": "https://500px.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.modernretail.co.uk/category/industry/influencer-marketing/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.six-spoke.com/news-feed.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://izea.com/blogs/feeds/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.socialchain.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lumanu.com/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://later.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.hopperhq.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://fohr.co/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/SCOTUS/all",
"dt": 1699830841,
"update": 1
},
{
"u": "https://feeds.washingtonpost.com/rss/politics/supreme-court",
"dt": 1699830841,
"update": 1
},
{
"u": "https://supreme.justia.com/supreme-court-rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.cornell.edu/supct/rss/supct_recent.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/criminal-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/criminal-justice-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lexology.com/feed/all/Civil%20Procedure",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/civil-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/civil/articles",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/civil-procedure/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://abovethelaw.com/category/litigation/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.jdsupra.com/rss/feed?tags=civil-litigation",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/corporate-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/corporate-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/corporate/articles",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.jdsupra.com/rss/feed?tags=corporate-law",
"dt": 1699830841,
"update": 1
},
{
"u": "https://feeds.feedburner.com/FindLawCorporateCounselNews",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/intellectual-property-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/intellectual+property+law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ipprotheinternet.com/RSS",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.jamsadr.com/rss.aspx",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.cpradr.org/news-publications/articles/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mediation.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.cedr.com/newsandevents/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://healthpolicy.usc.edu/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.fda.gov/about-fda/rss-feeds-fda",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.aha.org/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.beckershospitalreview.com/feeds/rss-articles.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.securityweek.com/rss-feeds",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.cnet.com/rss/news/security/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.ucla.edu/centers/social-policy/education-policy/news-and-events/rss-feeds",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.brookings.edu/feed/?type=content&cat=167",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.chalkbeat.org/feeds/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.insidehighered.com/rss/news",
"dt": 1699830841,
"update": 1
},
{
"u": "https://opil.ouplaw.com/search-result?siteToSearch=OCO&searchType=advanced&pageSize=10&sort=relevance&pageSize=10&sort=date-desc&q=*&fq=contentType:BookItem+OR+contentType:ChapterItem+OR+contentType:JournalItem+OR+contentType:ReferenceItem&fq=subject_Law:Constitutional+and+administrative+law&fl=title%2Cabstract%2CbodyText%2CbookTitle%2CjournalTitle%2Cauthor%2CpublisherName%2CpublicationDate%2CpublicationDateDisplay%2Cisbn%2Cdoi%2Cid%2Csubjects%2CcontentTypes%2CpeerReviewed%2CopenAccess%2Curl%2CthumbnailUrl%2CpdfUrl%2CxmlUrl%2ChtmlUrl%2CdocumentType%2Clanguage%2CaccessProviderName%2CaccessProviderDisplay&start=0",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.yalejreg.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topic/legal+ethics/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://georgetownlawjournal.org/feeds/journal-legal-ethics",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lexology.com/gatewayrss/22/Feed.aspx?g=9f7d415b-070a-4c47-a086-c8b7dbb08d06",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/content/dam/aba/publications/professional_lawyer/ProfLawyer_Journal_Fall_2019.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hq.ssrn.com/jourInvite.cfm?link=Legal-Writing-eJournal",
"dt": 1699830841,
"update": 1
},
{
"u": "https://writingcenterblog.unc.edu/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.legalbluebook.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.scribes.org/legal-writing-news?format=RSS",
"dt": 1699830841,
"update": 1
},
{
"u": "https://reason.com/volokh-conspiracy/feed/?cat=legal-writing",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.uscourts.gov/news/rss-feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.brennancenter.org/topics/judicial-selection/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.acslaw.org/our-work/resources/feed/?category=judicial-nominations",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.pewresearch.org/topics/u-s-politics-policy/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.courtlistener.com/docket/rss/ctapp/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.upenn.edu/live/rss/news.php",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/groups/legal_education/resources/legal-education-news-feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://taxprof.typepad.com/taxprof_blog/legal_education/index.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/groups/public_services/lawyer-referral/public-interest-connections/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.com/rss/nlj/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/legalindustry",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.washingtonpost.com/news/volokh-conspiracy/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.brennancenter.org/topics/public-interest-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.com/rss/technology/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topic/law+firm+management/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lawtechnologytoday.org/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topic/pro_bono/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.legalaidnyc.org/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.floridabar.org/publications/news/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://immigrantjustice.org/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/groups/legal_services/publications/solo_small_firm_resources/solo-in-a-box-rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nlada.org/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.probono.net/news/rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nalp.org/rss/news.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nita.org/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "http://www.federalevidence.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://legaltalknetwork.com/feed/podcast/in-the-mind-of-a-trial-lawyer/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.courtroomview.com/category/news/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lawfareblog.com/topic/international-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ibanet.org/RSS/IBA-RSS-News.aspx",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.asil.org/resources/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.icc-cpi.int/NewsFeed.aspx",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.georgetown.edu/international-law-journal/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ejil.org/rss_feeds.html",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/businessweekrss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.economist.com/sections/business-finance/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.industryweek.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.scdigest.com/rssfeeds.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.healthcarefinancenews.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ehstoday.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/markets/sitemap.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.fool.com/feeds/all.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bea.gov/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.worldbank.org/en/news/all/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://unece.org/info/news/news/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.census.gov/economic-indicators/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://venturebeat.com/category/startups/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/rss/topic/entrepreneurship",
"dt": 1699830841,
"update": 1
},
{
"u": "https://moz.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://salesloft.com/resources/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bdc.ca/en/articles-tools/entrepreneur-toolkit/business-strategy-planning/marketing/pages/default.aspx?rss=true",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.crmbuyer.com/perl/syndication/rssfull.pl",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.copper.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://blog.hubspot.com/crm/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "http://www.scdigest.com/rssfeeds.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.manufacturing.net/rss/all",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.industryweek.com/rss/allarticles.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.hrmagazine.co.uk/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/feeds/topics/organizational-culture",
"dt": 1699830841,
"update": 1
},
{
"u": "https://cultureiq.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://officevibe.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/feeds/topics/leadership",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mckinsey.com/rss/insights/leadership",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.strategy-business.com/rss/stratcon_all",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.gsb.stanford.edu/sites/default/files/feeds/cgri.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.navexglobal.com/en-us/company/blog/ethicscompliance/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.accountingtoday.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.journalofaccountancy.com/rss.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.wsj.com/news/cfo-journal/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.bloomberg.com/professional/accounting-news/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.fm-magazine.com/news/rss.html",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.accountingweb.com/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.risk.net/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.sec.gov/rss/pressrel.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.riskmanagementmonitor.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.isaca.org/resources/news-and-trends/isaca-now-blog/risk-management",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mergermarket.com/rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/professional/ma-news/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://pitchbook.com/news/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.wsj.com/news/business/deals",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.forbes.com/mergers-acquisitions/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.accountingtoday.com/tag/mergers-acquisitions",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.crowdfundinsider.com/category/equity-crowdfunding/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.crowdfundinsider.com/category/p2p-lending/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.fundable.com/blog/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.startengine.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/topics/international-trade/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.wto.org/rss/news_rss_e.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://unctad.org/news/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.forbes.com/international-business/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.sba.gov/rss/news.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://quickbooks.intuit.com/r/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.score.org/blog/feed",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.americanbar.org/content/dam/aba/rss/news/abanews.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.natlawreview.com/rss-feed.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://rss.nytimes.com/services/xml/rss/nyt/BusinessLaw.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://angel.co/blog.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/technology/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://sloanreview.mit.edu/topics/innovation/feed/",
"dt": 1699830841,
"update": 0
} }
] ]

View File

@@ -4,6 +4,10 @@ import sys
import json import json
from src.utils.retry_logic import retry_with_backoff from src.utils.retry_logic import retry_with_backoff
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
from pinecone import Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv(override=True) load_dotenv(override=True)
@@ -190,5 +194,23 @@ def deploy_infrastructure():
} }
]) ])
if os.getenv("STORAGE_STRATEGY") == 'pinecone':
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = os.getenv("PINECONE_DB_NAME")
embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
dimension=int(embedding_dim),
metric=vector_search_metric,
spec = ServerlessSpec(
cloud="aws",
region=os.getenv("AWS_REGION"),
),
)
if __name__ == "__main__": if __name__ == "__main__":
deploy_infrastructure() deploy_infrastructure()

View File

@@ -1,13 +1,14 @@
import os import os
from pinecone import Pinecone from pinecone import Pinecone
from openai import OpenAI from openai import OpenAI
# Set up Pinecone client
# Read the correctly spelled PINECONE_API_KEY first (the spelling used by the
# infrastructure deployment code); fall back to the historical misspelling
# PINCEONE_API_KEY so existing environment files keep working.
api_key = os.getenv("PINECONE_API_KEY") or os.getenv("PINCEONE_API_KEY")
shards = os.getenv("PINECONE_SHARDS")
embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
# Kept as the raw environment string (or None when unset); not converted to int here.
embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
index_name = os.getenv("PINECONE_DB_NAME")
@@ -16,12 +17,7 @@ pc = Pinecone(api_key=api_key)
def get_index():
    """Return a handle to the configured Pinecone index.

    Checks that the module-level ``index_name`` is among the names reported by
    ``pc.list_indexes().names()`` and, if so, opens it with ``pc.Index``.
    The index is not created here.

    Returns:
        The object returned by ``pc.Index(index_name)``.

    Raises:
        KeyError: If ``index_name`` is not among the existing index names.
    """
    if index_name not in pc.list_indexes().names():
        # Raise instead of returning the exception object: a returned KeyError
        # would be treated by callers as if it were a usable index handle.
        raise KeyError(f"Index {index_name} not found")
    return pc.Index(index_name)
@@ -29,7 +25,7 @@ def get_index():
def vectorize(article:str) -> list[float]:
    """Embed ``article`` through ``client.embeddings.create``.

    The model name is read from the ``OPENAI_EMBEDDING_MODEL`` environment
    variable on every call, with ``'text'`` as the fallback value.
    NOTE(review): ``'text'`` does not look like a complete embedding model
    identifier -- confirm the intended default.

    Args:
        article: Text passed as ``input`` to the embeddings call.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    model_name = os.getenv('OPENAI_EMBEDDING_MODEL', 'text')
    embedding_response = client.embeddings.create(input=article, model=model_name)
    first_item = embedding_response.data[0]
    return first_item.embedding

View File

@@ -34,7 +34,6 @@ def pinecone_save_article(article:dict):
article["id"] = article["article_id"] article["id"] = article["article_id"]
article["values"] = vectorize(article["content"]) article["values"] = vectorize(article["content"])
namespace = os.getenv('PINECONE_NAMESPACE') namespace = os.getenv('PINECONE_NAMESPACE')
logger.info("Upserting article to Pinecone") logger.info("Upserting article to Pinecone")

View File

@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone from datetime import datetime, timezone
import logging import logging
from string import Template from string import Template
from tqdm import tqdm
class S3BatchDownloader: class S3BatchDownloader:
"""Class for batch downloading RSS articles from S3""" """Class for batch downloading RSS articles from S3"""
@@ -93,15 +94,17 @@ class S3BatchDownloader:
if self._is_in_date_range(obj['LastModified'], start_ts, end_ts) if self._is_in_date_range(obj['LastModified'], start_ts, end_ts)
] ]
self.logger.info(f"Found {len(objects)} objects to process") self.logger.info(f"Found {len(objects)} objects to process")
print(f"Found {len(objects)} objects to process")
# Download and merge data # Download and merge data
all_data = [] all_data = []
with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor: with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor, tqdm(total=len(objects), unit="object") as progress_bar:
future_to_obj = {executor.submit(self._download_object, obj): obj for obj in objects} future_to_obj = {executor.submit(self._download_object, obj): obj for obj in objects}
for future in as_completed(future_to_obj): for future in as_completed(future_to_obj):
result = future.result() result = future.result()
if result is not None: if result is not None:
all_data.extend(result if isinstance(result, list) else [result]) all_data.extend(result if isinstance(result, list) else [result])
progress_bar.update(1)
# Save to file # Save to file
self._save_to_file(all_data, output_path, file_format) self._save_to_file(all_data, output_path, file_format)

View File

@@ -1,9 +1,15 @@
# After Public Launch # After Public Launch
* Monthly Kaggle Dataset Publishing. * Monthly Kaggle Dataset Publishing.
* Vector Database Initialization at earlier phase. [ Done ]
* Test out Vector Databases at Small Scale. * Test out Vector Databases at Small Scale.
* [ ] Testing
* [ ] Fix OpenAI Error.
* [ ] Let it run for a day
* Test out Vector Databases at Scale. * Test out Vector Databases at Scale.
* Monthly ingestion job
* Protocol for annotating data. * Protocol for annotating data.
* Promotional Material * Promotional Material
* Stuff that is not news * Stuff that is not news