I have a Python file in PyCharm that takes a CSV of CIKs (EDGAR's Central Index Key company identifier) and scrapes the number of subsidiaries.
This is the script in case you want to see it, but my issue isn't the script itself. It works fine in Python.
import os
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import dateutil.parser # Safely handle date parsing
def get_unique_filename(base_path, original_filename):
    filename, extension = os.path.splitext(original_filename)
    counter = 1
    new_filename = original_filename
    while os.path.exists(os.path.join(base_path, new_filename)):
        new_filename = f"{filename}_{counter}{extension}"
        counter += 1
    return new_filename
# Variables for file paths and web scraping
base_path = "C:\\Users\\charl\\Downloads"
original_filename = "results.csv"
# Ensure that fieldnames are defined
fieldnames = ['ticker', 'cik', 'company_name', 'sic_code', 'sic_description', 'filing_date', 'doc_link', 'ex21_link',
'num_subsidiaries', 'subsidiaries_list', 'recent_8k_link', 'recent_8k_excerpt']
def get_recent_10k_links(ticker, cik, user_agent):
    print(f"Processing Ticker: {ticker}, CIK: {cik.zfill(10)}")  # Ensure CIK is zero-padded to 10 digits
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    headers = {'User-Agent': user_agent}
    params = {
        'action': 'getcompany',
        'CIK': cik.zfill(10),  # Ensure CIK is 10 digits long
        'type': '10-K',
        'owner': 'exclude',
        'output': 'atom',
        'count': 10,
    }
    # Construct the full URL and print it
    sec_url = requests.Request('GET', base_url, params=params).prepare().url
    print(f"Fetching SEC XML page from: {sec_url}")  # This will display the SEC XML page link in the log
    response = requests.get(sec_url, headers=headers)
    if response.status_code == 200:
        # Continue processing the XML as before...
        soup = BeautifulSoup(response.content, 'xml')
        # Get the company name from the <conformed-name> tag
        company_name_tag = soup.find('conformed-name')
        company_name = company_name_tag.get_text(strip=True) if company_name_tag else 'Not Found'
        # Get the SIC code from the <assigned-sic> tag
        sic_code_tag = soup.find('assigned-sic')
        sic_code = sic_code_tag.get_text(strip=True) if sic_code_tag else 'Not Found'
        # Optionally, get the SIC description from the <assigned-sic-desc> tag
        sic_desc_tag = soup.find('assigned-sic-desc')
        sic_desc = sic_desc_tag.get_text(strip=True) if sic_desc_tag else 'Not Found'
        # Get the 10-K filings
        entries = soup.find_all('entry')
        results = []
        one_year_ago = datetime.now() - timedelta(days=365)
        found = False
        for entry in entries:
            filing_date = entry.find('updated').text[:10]  # Extract the date
            filing_date = dateutil.parser.parse(filing_date)  # Use dateutil.parser to safely parse the date
            if filing_date > one_year_ago:
                found = True
                doc_link = entry.find('link')['href']
                ex21_link = fetch_exhibit_21_link(doc_link, headers)
                subsidiaries, subsidiaries_list = count_subsidiaries(ex21_link, headers) if ex21_link else (0, [])
                # Also fetch 8-K related data
                recent_8k_link, recent_8k_excerpt = fetch_recent_8k(cik, headers)
                results.append({
                    'ticker': ticker,
                    'cik': cik.zfill(10),
                    'company_name': company_name,
                    'sic_code': sic_code,
                    'filing_date': filing_date.date(),
                    'doc_link': doc_link,
                    'ex21_link': ex21_link or 'Not Found',
                    'num_subsidiaries': subsidiaries,
                    'subsidiaries_list': subsidiaries_list,
                    'sic_description': sic_desc,
                    'recent_8k_link': recent_8k_link,
                    'recent_8k_excerpt': recent_8k_excerpt
                })
        if not found:
            print(f"No recent 10-K filings found for Ticker: {ticker}, CIK: {cik}")
            results.append({
                'ticker': ticker,
                'cik': cik.zfill(10),
                'company_name': company_name,
                'sic_code': sic_code,
                'filing_date': 'No recent filings',
                'doc_link': 'Not Found',
                'ex21_link': 'Not Found',
                'num_subsidiaries': 'Not Found',
                'subsidiaries_list': 'Not Found',
                'sic_description': sic_desc,
                'recent_8k_link': 'Not Found',
                'recent_8k_excerpt': 'Not Found'
            })
        return results
    else:
        print(f"Failed to fetch data for Ticker: {ticker}, CIK: {cik}, HTTP status: {response.status_code}")
        return []
def fetch_recent_8k(cik, headers):
    # Build the URL for the search page for the recent 8-K with 5.02 and "chief executive"
    search_url = f"https://www.sec.gov/edgar/search/#/q=5.02%2520%2522chief%2520executive%2522&category=custom&ciks={cik.zfill(10)}&forms=8-K&sort=desc"
    print(f"Searching recent 8-K for CIK {cik.zfill(10)} at {search_url}")
    response = requests.get(search_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table_body = soup.find('tbody')
        if table_body:
            first_row = table_body.find('tr')
            if first_row:
                columns = first_row.find_all('td')
                if len(columns) > 0:
                    filing_link = columns[1].find('a')['href']  # Assume filing link is in the second column
                    excerpt = columns[2].get_text(strip=True)[:100]  # First 100 characters of the document text
                    full_filing_link = f"https://www.sec.gov{filing_link}" if not filing_link.startswith('http') else filing_link
                    print(f"Found recent 8-K filing link: {full_filing_link}")
                    return full_filing_link, excerpt
    print("No recent 8-K filings found for 5.02 and 'chief executive'.")
    return 'Not Found', 'Not Found'
def fetch_exhibit_21_link(doc_link, headers):
    print(f"Fetching document page from {doc_link}")
    response = requests.get(doc_link, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        for a in soup.find_all('a', href=True):
            if 'ex21' in a['href'].lower() or 'exhibit21' in a['href'].lower() or \
                    'exx21' in a['href'].lower() or 'exh-21' in a['href'].lower() or \
                    'listofsubsid' in a['href'].lower() or 'ex-211' in a['href'].lower() or \
                    'exhibit 21' in a.text.lower():
                href = a['href']
                if not href.startswith('http'):
                    href = f"https://www.sec.gov{href}"
                print(f"Exhibit 21 link found: {href}")
                return href
        print("Exhibit 21 link not found in this document.")
    else:
        print(f"Failed to fetch document page: HTTP {response.status_code}")
    return None
def count_subsidiaries(ex21_link, headers):
    if ex21_link != "Not Found":
        response = requests.get(ex21_link, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            rows = soup.find_all('tr')
            subsidiaries_list = [row.get_text(strip=True) for row in rows if row.find('td')]
            print(f"Counted {len(subsidiaries_list)} subsidiaries.")
            return len(subsidiaries_list), subsidiaries_list
    print("No subsidiaries found or invalid link.")
    return 0, []
def read_ticker_and_ciks_from_csv(file_path):
    with open(file_path, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # Skip header
        return [(row[0], row[1]) for row in csvreader if row]
# File paths for CIKs
input_file_path = "C:\\Users\\charl\\OneDrive - Planck LLC d b a Patch Media\\SEC Data\\sampleciks.csv"
output_file_path = os.path.join(base_path, get_unique_filename(base_path, original_filename))
# Print the output file path to ensure correctness
print(f"Writing results to: {output_file_path}")
ticker_cik_list = read_ticker_and_ciks_from_csv(input_file_path)
user_agent = 'PlanckMedia SECDataScraper/1.0 (charles@haleglobal.com)'
# Max characters allowed in an Excel cell
max_chars = 32767
try:
    # Open CSV in write mode and write header once
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()  # Write header at the start of the file
        # Iterate through CIKs and write results incrementally
        for ticker, cik in ticker_cik_list:
            results = get_recent_10k_links(ticker, cik, user_agent)
            if not results:
                print(f"No results found for Ticker: {ticker}, CIK: {cik.zfill(10)}")
                results = [
                    {'ticker': ticker, 'cik': cik.zfill(10), 'company_name': 'Not Found', 'sic_code': 'Not Found',
                     'filing_date': 'No filings found', 'doc_link': 'Not Found',
                     'ex21_link': 'Not Found', 'num_subsidiaries': 'Not Found',
                     'subsidiaries_list': 'Not Found', 'recent_8k_link': 'Not Found',
                     'recent_8k_excerpt': 'Not Found'}]
            for result in results:
                if isinstance(result['subsidiaries_list'], list) and len(result['subsidiaries_list']) > max_chars:
                    result['subsidiaries_list'] = result['subsidiaries_list'][:max_chars]
                writer.writerow(result)  # Write row after processing each CIK
    print(f"Results successfully written to {output_file_path}")
except Exception as e:
    print(f"An error occurred: {e}")
print("Script completed and results are written to the file.")
But how do I get Power BI to run the script in Power Query? First, I imported the table that tells Python which CIKs to scrape as "sampleciks" in Power Query.
The tricky part is that I can't figure out how to get a new query to launch Python. Is it like R, such that it would look like this? If so, what is the command to run a Python script? OpenAI gives ridiculous answers that make no sense.
let
    Source = sampleciks,
    // Running an R script PASTE R SCRIPT FROM VISUAL IN HERE
    RunPScript = P.Execute("
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import dateutil.parser

# Function to fetch recent 10-K filings from SEC
def get_recent_10k_links(cik, user_agent):
    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar'
    headers = {'User-Agent': user_agent}
    params = {
        'action': 'getcompany',
        'CIK': cik.zfill(10),  # Ensure CIK is zero-padded to 10 digits
        'type': '10-K',
        'owner': 'exclude',
        'output': 'atom',
        'count': 10,
    }
    # Send a GET request to the SEC site
    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'xml')
        # Get company details
        company_name_tag = soup.find('conformed-name')
        company_name = company_name_tag.get_text(strip=True) if company_name_tag else 'Not Found'
        sic_code_tag = soup.find('assigned-sic')
        sic_code = sic_code_tag.get_text(strip=True) if sic_code_tag else 'Not Found'
        sic_desc_tag = soup.find('assigned-sic-desc')
        sic_desc = sic_desc_tag.get_text(strip=True) if sic_desc_tag else 'Not Found'
        # Extract filings
        entries = soup.find_all('entry')
        results = []
        one_year_ago = datetime.now() - timedelta(days=365)
        for entry in entries:
            filing_date = entry.find('updated').text[:10]
            filing_date = dateutil.parser.parse(filing_date)
            if filing_date > one_year_ago:
                doc_link = entry.find('link')['href']
                results.append({
                    'cik': cik.zfill(10),
                    'company_name': company_name,
                    'sic_code': sic_code,
                    'filing_date': filing_date.date(),
                    'doc_link': doc_link,
                    'sic_description': sic_desc
                })
        return results
    else:
        return [{'cik': cik, 'company_name': 'Not Found', 'sic_code': 'Not Found', 'filing_date': 'No recent filings', 'doc_link': 'Not Found', 'sic_description': 'Not Found'}]

# Set user agent
user_agent = 'PlanckMedia SECDataScraper/1.0 (charles@haleglobal.com)'
# Initialize empty list for results
results_list = []
# Iterate through each row of the 'dataset' (which is automatically passed from Power Query)
for index, row in dataset.iterrows():
    cik = str(row['cik'])  # Extract CIK from the row
    results = get_recent_10k_links(cik, user_agent)  # Call the SEC fetching function
    for result in results:
        results_list.append(result)
", [dataset=#"Replaced Value1"]),
    // Extract the output table from the list
    OutputTable = RunPScript{0}[Value]
in
    OutputTable
Hey @charleshale ,
if the BeautifulSoup package is not supported by the Power BI Service, then there is a problem.
Maybe it's possible to create a Python app running in Azure, or an Azure Function. That app or function could do the data extraction and create a JSON document, a CSV, or a Parquet file that is then consumed by Power BI.
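Just as a rough sketch of that idea (not a finished implementation): something like the script below could run on a schedule outside Power BI and write the file. The module name sec_scraper is only a placeholder for the script you already posted; adjust the paths to your environment.

import pandas as pd
# Placeholder import: assumes the functions from your script live in a module called sec_scraper
from sec_scraper import get_recent_10k_links, read_ticker_and_ciks_from_csv

USER_AGENT = 'PlanckMedia SECDataScraper/1.0 (charles@haleglobal.com)'

def export_for_power_bi(cik_csv_path, output_path):
    # Run the existing scraper for every ticker/CIK pair and collect the result rows
    rows = []
    for ticker, cik in read_ticker_and_ciks_from_csv(cik_csv_path):
        rows.extend(get_recent_10k_links(ticker, cik, USER_AGENT))
    # Hand Power BI a flat file instead of running Python inside Power Query
    df = pd.DataFrame(rows)
    df.to_csv(output_path, index=False)  # df.to_parquet(...) also works if pyarrow is installed
    return df

if __name__ == '__main__':
    export_for_power_bi('sampleciks.csv', 'sec_results.csv')

Power BI then just imports sec_results.csv (or reads it from blob storage), so no Python packages are needed at refresh time.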
Hopefully, this provides an idea of how to tackle your challenge.
Regards,
Tom
Hey @charleshale ,
check this document: https://learn.microsoft.com/en-us/power-bi/connect-data/desktop-python-scripts
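To make that concrete: in the Power Query Editor you select your sampleciks query and use Transform > Run Python script; Power Query then wraps whatever you type into a Python.Execute step (that is the function your P.Execute guess was reaching for). A minimal sketch of the script body for such a step, assuming your CIK column is literally named 'cik' (adjust if not):

# 'dataset' is the pandas DataFrame Power Query passes into the step (here: sampleciks)
# Any DataFrame left in scope shows up as a table you can expand in the next step
import pandas as pd

output = dataset.copy()
output['cik_padded'] = output['cik'].astype(str).str.zfill(10)  # assumes the column is named 'cik'

In Desktop this runs against whatever environment is set under Options > Python scripting, so packages such as requests or beautifulsoup4 work there as long as they are installed in that environment.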
Regards,
Tom
Thank you, @TomMartens. It looks, then, like Power BI only supports pandas, so the issue is that a BeautifulSoup data harvest isn't supported. Does that sound right?