This commit is contained in:
Charles-Gormley
2024-11-30 23:26:17 -05:00
parent 28e964fa10
commit 9dda7ff55a
10 changed files with 1296 additions and 289398 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

227
monthly_ingestion.ipynb Normal file
View File

@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RSS Article Batch Download Examples\n",
"\n",
"This notebook demonstrates how to batch download RSS articles from S3."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from time import time\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Root imports\n",
"\n",
"from src.search.batch import S3BatchDownloader\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"downloader = S3BatchDownloader()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the Downloader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"List permission: Allowed\n"
]
}
],
"source": [
"import boto3\n",
"from botocore.exceptions import ClientError\n",
"\n",
"s3 = boto3.client('s3')\n",
"\n",
"# Test ListObjects\n",
"try:\n",
" response = s3.list_objects_v2(Bucket=os.getenv(\"S3_BUCKET_NAME\"))\n",
" print(\"List permission: Allowed\")\n",
"except ClientError as e:\n",
" print(\"List permission: Denied\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 59592 objects to process\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 53%|█████▎ | 31707/59592 [04:33<6:08:33, 1.26object/s]IOStream.flush timed out\n",
"100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to November-1.csv in 605.81 seconds\n"
]
}
],
"source": [
"start = time()\n",
"output_path = \"November-1.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Define date range\n",
"start_date = \"2024-11-17\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-22\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
" output_path=output_path,\n",
" file_format=\"csv\", # or \"json\"\n",
" start_date=start_date,\n",
" end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 50037 objects to process\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 66%|██████▌ | 33067/50037 [04:07<11:49, 23.92object/s] IOStream.flush timed out\n",
"100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloaded data to November-2.csv in 539.74 seconds\n"
]
}
],
"source": [
"start = time()\n",
"output_path = \"November-2.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Define date range\n",
"start_date = \"2024-11-23\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-27\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
" output_path=output_path,\n",
" file_format=\"csv\", # or \"json\"\n",
" start_date=start_date,\n",
" end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start = time()\n",
"output_path = \"November-3.csv\" # or \"consolidated_data.json\"\n",
"\n",
"# Define date range\n",
"start_date = \"2024-11-8\" # FIXME: Fix the error where data can't be collected before the date you started collecting.\n",
"end_date = \"2024-11-30\"\n",
"\n",
"# Start downloading\n",
"downloader.download_to_file(\n",
" output_path=output_path,\n",
" file_format=\"csv\", # or \"json\"\n",
" start_date=start_date,\n",
" end_date=end_date\n",
")\n",
"\n",
"print(f\"Downloaded data to {output_path} in {time() - start:.2f} seconds\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Aggregating"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -5,3 +5,4 @@ constructs==10.2.69
# Optional, yet necessary for the Pinecone SDK functionality.
pinecone
openai
tqdm

View File

@@ -93,5 +93,910 @@
"u": "https://www.aspireiq.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://theinfluencermarketingfactory.com/blogs/influencer-marketing-tips.atom",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.forbes.com/personal-brands/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.forbes.com/event-hosting/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://cmxhub.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.thepodcasthost.com/feed/podcast/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://podcastmovementsessions.libsyn.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://podcraft.libsyn.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "http://feeds.feedburner.com/streamingmedia/TV",
"dt": 1699830841,
"update": 1
},
{
"u": "https://livestream.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ecommercetimes.com/perl/syndication/rssfull.pl",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.causevox.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/rss/harvard-business-review",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.fastcompany.com/feed/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.thedrum.com/rss/news.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://crisis-response.com/Articles/index.php/rss-feeds",
"dt": 1699830841,
"update": 1
},
{
"u": "https://prcg.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.odwyerpr.com/rss-feed.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.prweek.com/uk/rss/news",
"dt": 1699830841,
"update": 1
},
{
"u": "http://feeds.feedburner.com/clickboothblog",
"dt": 1699830841,
"update": 1
},
{
"u": "https://feeds.feedburner.com/avantlinkblog",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.slickdeals.net/feeds/affiliate/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://neoreach.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://onalytica.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.braginskyassociates.com/blog-feed.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.sociallyawareblog.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mediapost.com/rss/news/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nikonusa.com/en/learn-and-explore/index.page",
"dt": 1699830841,
"update": 1
},
{
"u": "https://500px.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.modernretail.co.uk/category/industry/influencer-marketing/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.six-spoke.com/news-feed.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://izea.com/blogs/feeds/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.socialchain.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lumanu.com/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://later.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.hopperhq.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://fohr.co/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/SCOTUS/all",
"dt": 1699830841,
"update": 1
},
{
"u": "https://feeds.washingtonpost.com/rss/politics/supreme-court",
"dt": 1699830841,
"update": 1
},
{
"u": "https://supreme.justia.com/supreme-court-rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.cornell.edu/supct/rss/supct_recent.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/criminal-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/criminal-justice-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lexology.com/feed/all/Civil%20Procedure",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/civil-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/civil/articles",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/civil-procedure/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://abovethelaw.com/category/litigation/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.jdsupra.com/rss/feed?tags=civil-litigation",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/corporate-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/corporate-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/corporate/articles",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.jdsupra.com/rss/feed?tags=corporate-law",
"dt": 1699830841,
"update": 1
},
{
"u": "https://feeds.feedburner.com/FindLawCorporateCounselNews",
"dt": 1699830841,
"update": 1
},
{
"u": "https://law.justia.com/feeds/categories/intellectual-property-law.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topics/intellectual+property+law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ipprotheinternet.com/RSS",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.jamsadr.com/rss.aspx",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.cpradr.org/news-publications/articles/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mediation.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.cedr.com/newsandevents/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://healthpolicy.usc.edu/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.fda.gov/about-fda/rss-feeds-fda",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.aha.org/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.beckershospitalreview.com/feeds/rss-articles.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.securityweek.com/rss-feeds",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.cnet.com/rss/news/security/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.ucla.edu/centers/social-policy/education-policy/news-and-events/rss-feeds",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.brookings.edu/feed/?type=content&cat=167",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.chalkbeat.org/feeds/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.insidehighered.com/rss/news",
"dt": 1699830841,
"update": 1
},
{
"u": "https://opil.ouplaw.com/search-result?siteToSearch=OCO&searchType=advanced&pageSize=10&sort=relevance&pageSize=10&sort=date-desc&q=*&fq=contentType:BookItem+OR+contentType:ChapterItem+OR+contentType:JournalItem+OR+contentType:ReferenceItem&fq=subject_Law:Constitutional+and+administrative+law&fl=title%2Cabstract%2CbodyText%2CbookTitle%2CjournalTitle%2Cauthor%2CpublisherName%2CpublicationDate%2CpublicationDateDisplay%2Cisbn%2Cdoi%2Cid%2Csubjects%2CcontentTypes%2CpeerReviewed%2CopenAccess%2Curl%2CthumbnailUrl%2CpdfUrl%2CxmlUrl%2ChtmlUrl%2CdocumentType%2Clanguage%2CaccessProviderName%2CaccessProviderDisplay&start=0",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.yalejreg.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topic/legal+ethics/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://georgetownlawjournal.org/feeds/journal-legal-ethics",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lexology.com/gatewayrss/22/Feed.aspx?g=9f7d415b-070a-4c47-a086-c8b7dbb08d06",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/content/dam/aba/publications/professional_lawyer/ProfLawyer_Journal_Fall_2019.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hq.ssrn.com/jourInvite.cfm?link=Legal-Writing-eJournal",
"dt": 1699830841,
"update": 1
},
{
"u": "https://writingcenterblog.unc.edu/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.legalbluebook.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.scribes.org/legal-writing-news?format=RSS",
"dt": 1699830841,
"update": 1
},
{
"u": "https://reason.com/volokh-conspiracy/feed/?cat=legal-writing",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.uscourts.gov/news/rss-feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.brennancenter.org/topics/judicial-selection/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.acslaw.org/our-work/resources/feed/?category=judicial-nominations",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.pewresearch.org/topics/u-s-politics-policy/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.courtlistener.com/docket/rss/ctapp/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.upenn.edu/live/rss/news.php",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/groups/legal_education/resources/legal-education-news-feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://taxprof.typepad.com/taxprof_blog/legal_education/index.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/groups/public_services/lawyer-referral/public-interest-connections/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.com/rss/nlj/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law360.com/rss/legalindustry",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.washingtonpost.com/news/volokh-conspiracy/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.brennancenter.org/topics/public-interest-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.com/rss/technology/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topic/law+firm+management/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lawtechnologytoday.org/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.abajournal.com/topic/pro_bono/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.legalaidnyc.org/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.floridabar.org/publications/news/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://immigrantjustice.org/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.americanbar.org/groups/legal_services/publications/solo_small_firm_resources/solo-in-a-box-rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nlada.org/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.probono.net/news/rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nalp.org/rss/news.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.nita.org/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "http://www.federalevidence.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://legaltalknetwork.com/feed/podcast/in-the-mind-of-a-trial-lawyer/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.courtroomview.com/category/news/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.lawfareblog.com/topic/international-law/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ibanet.org/RSS/IBA-RSS-News.aspx",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.asil.org/resources/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.icc-cpi.int/NewsFeed.aspx",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.law.georgetown.edu/international-law-journal/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ejil.org/rss_feeds.html",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/businessweekrss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.economist.com/sections/business-finance/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.industryweek.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.scdigest.com/rssfeeds.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.healthcarefinancenews.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.ehstoday.com/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/markets/sitemap.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.fool.com/feeds/all.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bea.gov/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.worldbank.org/en/news/all/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://unece.org/info/news/news/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.census.gov/economic-indicators/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://venturebeat.com/category/startups/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/rss/topic/entrepreneurship",
"dt": 1699830841,
"update": 1
},
{
"u": "https://moz.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://salesloft.com/resources/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bdc.ca/en/articles-tools/entrepreneur-toolkit/business-strategy-planning/marketing/pages/default.aspx?rss=true",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.crmbuyer.com/perl/syndication/rssfull.pl",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.copper.com/blog/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://blog.hubspot.com/crm/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "http://www.scdigest.com/rssfeeds.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.manufacturing.net/rss/all",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.industryweek.com/rss/allarticles.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.hrmagazine.co.uk/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/feeds/topics/organizational-culture",
"dt": 1699830841,
"update": 1
},
{
"u": "https://cultureiq.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://officevibe.com/blog/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/feeds/topics/leadership",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mckinsey.com/rss/insights/leadership",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.strategy-business.com/rss/stratcon_all",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.gsb.stanford.edu/sites/default/files/feeds/cgri.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.navexglobal.com/en-us/company/blog/ethicscompliance/feed",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.accountingtoday.com/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.journalofaccountancy.com/rss.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.wsj.com/news/cfo-journal/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.bloomberg.com/professional/accounting-news/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.fm-magazine.com/news/rss.html",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.accountingweb.com/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.risk.net/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.sec.gov/rss/pressrel.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.riskmanagementmonitor.com/feed/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.isaca.org/resources/news-and-trends/isaca-now-blog/risk-management",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.mergermarket.com/rss/",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/professional/ma-news/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://pitchbook.com/news/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.wsj.com/news/business/deals",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.forbes.com/mergers-acquisitions/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.accountingtoday.com/tag/mergers-acquisitions",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.crowdfundinsider.com/category/equity-crowdfunding/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.crowdfundinsider.com/category/p2p-lending/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.fundable.com/blog/rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.startengine.com/blog/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.bloomberg.com/topics/international-trade/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.wto.org/rss/news_rss_e.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://unctad.org/news/rss",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.forbes.com/international-business/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.sba.gov/rss/news.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://quickbooks.intuit.com/r/feed/",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.score.org/blog/feed",
"dt": 1699830841,
"update": 0
},
{
"u": "https://www.americanbar.org/content/dam/aba/rss/news/abanews.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://www.natlawreview.com/rss-feed.xml",
"dt": 1699830841,
"update": 0
},
{
"u": "https://rss.nytimes.com/services/xml/rss/nyt/BusinessLaw.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://angel.co/blog.rss",
"dt": 1699830841,
"update": 1
},
{
"u": "https://hbr.org/technology/rss.xml",
"dt": 1699830841,
"update": 1
},
{
"u": "https://sloanreview.mit.edu/topics/innovation/feed/",
"dt": 1699830841,
"update": 0
}
]

View File

@@ -4,6 +4,10 @@ import sys
import json
from src.utils.retry_logic import retry_with_backoff
from botocore.exceptions import ClientError
from pinecone import Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
load_dotenv(override=True)
@@ -190,5 +194,23 @@ def deploy_infrastructure():
}
])
if os.getenv("STORAGE_STRATEGY") == 'pinecone':
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = os.getenv("PINECONE_DB_NAME")
embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
dimension=int(embedding_dim),
metric=vector_search_metric,
spec = ServerlessSpec(
cloud="aws",
region=os.getenv("AWS_REGION"),
),
)
if __name__ == "__main__":
deploy_infrastructure()

View File

@@ -1,13 +1,14 @@
import os
from pinecone import Pinecone
from openai import OpenAI
# Pinecone / embedding settings, read once from environment variables at
# import time. Each value is None when its variable is unset.
#
# The key was previously read from the misspelled "PINCEONE_API_KEY"; the
# correctly spelled name is preferred and the old spelling is kept only as a
# fallback so existing environments keep working.
api_key = os.getenv("PINECONE_API_KEY") or os.getenv("PINCEONE_API_KEY")
shards = os.getenv("PINECONE_SHARDS")
embedding_model = os.getenv("VECTOR_EMBEDDING_MODEL")
# Kept as the raw string (no int() here, which would raise TypeError when the
# variable is unset); convert at the point of use.
embedding_dim = os.getenv("VECTOR_EMBEDDING_DIM")
vector_search_metric = os.getenv("VECTOR_SEARCH_METRIC")
index_name = os.getenv("PINECONE_DB_NAME")
@@ -16,12 +17,7 @@ pc = Pinecone(api_key=api_key)
def get_index():
    """Return a handle to the configured Pinecone index.

    This function only looks the index up; it does not create it.

    Returns:
        The object returned by ``pc.Index(index_name)``.

    Raises:
        KeyError: If ``index_name`` is not among ``pc.list_indexes().names()``.
    """
    if index_name not in pc.list_indexes().names():
        # Raise rather than return: a returned KeyError instance would be
        # handed back to the caller as though it were the index handle.
        raise KeyError(f"Index {index_name} not found")
    return pc.Index(index_name)
@@ -29,7 +25,7 @@ def get_index():
def vectorize(article: str) -> list[float]:
    """Embed an article's text via the OpenAI embeddings endpoint.

    Args:
        article: Text passed as ``input`` to ``client.embeddings.create``.

    Returns:
        The ``embedding`` of the first item in the response's ``data``.
    """
    response = client.embeddings.create(
        input=article,
        # Exactly one ``model`` keyword: the name comes from
        # OPENAI_EMBEDDING_MODEL and falls back to 'text' when unset.
        # NOTE(review): 'text' does not look like a valid embedding model
        # name — confirm the intended default.
        model=os.getenv('OPENAI_EMBEDDING_MODEL', 'text'),
    )
    return response.data[0].embedding

View File

@@ -34,7 +34,6 @@ def pinecone_save_article(article:dict):
article["id"] = article["article_id"]
article["values"] = vectorize(article["content"])
namespace = os.getenv('PINECONE_NAMESPACE')
logger.info("Upserting article to Pinecone")

View File

@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
import logging
from string import Template
from tqdm import tqdm
class S3BatchDownloader:
"""Class for batch downloading RSS articles from S3"""
@@ -93,15 +94,17 @@ class S3BatchDownloader:
if self._is_in_date_range(obj['LastModified'], start_ts, end_ts)
]
self.logger.info(f"Found {len(objects)} objects to process")
print(f"Found {len(objects)} objects to process")
# Download and merge data
all_data = []
with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor:
with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor, tqdm(total=len(objects), unit="object") as progress_bar:
future_to_obj = {executor.submit(self._download_object, obj): obj for obj in objects}
for future in as_completed(future_to_obj):
result = future.result()
if result is not None:
all_data.extend(result if isinstance(result, list) else [result])
progress_bar.update(1)
# Save to file
self._save_to_file(all_data, output_path, file_format)

View File

@@ -1,9 +1,15 @@
# After Public Launch
* Monthly Kaggle Dataset Publishing.
* Vector Database Initialization at earlier phase. [ Done ]
* Test out Vector Databases at Small Scale.
* [ ] Testing
* [ ] Fix OpenAI Error.
* [ ] Let it run for a day
* Test out Vector Databases at Scale.
* Monthly ingestion job
* Protocol for annotating data.
* Promotional Material
* Stuff that is not news