I have a Python file in PyCharm that takes a CSV of CIKs (EDGAR's Central Index Key company identifier) and scrapes the number of subsidiaries.
This is the script in case you want to see it, but my issue isn't the script itself. It works fine in Python.
import os
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import dateutil.parser # Safely handle date parsing
def get_unique_filename(base_path, original_filename):
    filename, extension = os.path.splitext(original_filename)
    counter = 1
    new_filename = original_filename
    while os.path.exists(os.path.join(base_path, new_filename)):
        new_filename = f"{filename}_{counter}{extension}"
        counter += 1
    return new_filename
# Variables for file paths and web scraping
base_path = "C:\\Users\\charl\\Downloads"
original_filename = "results.csv"
# Ensure that fieldnames are defined
fieldnames = ['ticker', 'cik', 'company_name', 'sic_code', 'sic_description', 'filing_date', 'doc_link', 'ex21_link',
'num_subsidiaries', 'subsidiaries_list', 'recent_8k_link', 'recent_8k_excerpt']
def get_recent_10k_links(ticker, cik, user_agent):
    print(f"Processing Ticker: {ticker}, CIK: {cik.zfill(10)}")  # Ensure CIK is zero-padded to 10 digits
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    headers = {'User-Agent': user_agent}
    params = {
        'action': 'getcompany',
        'CIK': cik.zfill(10),  # Ensure CIK is 10 digits long
        'type': '10-K',
        'owner': 'exclude',
        'output': 'atom',
        'count': 10,
    }
    # Construct the full URL and print it
    sec_url = requests.Request('GET', base_url, params=params).prepare().url
    print(f"Fetching SEC XML page from: {sec_url}")  # This will display the SEC XML page link in the log
    response = requests.get(sec_url, headers=headers)
    if response.status_code == 200:
        # Continue processing the XML as before...
        soup = BeautifulSoup(response.content, 'xml')
        # Get the company name from the <conformed-name> tag
        company_name_tag = soup.find('conformed-name')
        company_name = company_name_tag.get_text(strip=True) if company_name_tag else 'Not Found'
        # Get the SIC code from the <assigned-sic> tag
        sic_code_tag = soup.find('assigned-sic')
        sic_code = sic_code_tag.get_text(strip=True) if sic_code_tag else 'Not Found'
        # Optionally, get the SIC description from the <assigned-sic-desc> tag
        sic_desc_tag = soup.find('assigned-sic-desc')
        sic_desc = sic_desc_tag.get_text(strip=True) if sic_desc_tag else 'Not Found'
        # Get the 10-K filings
        entries = soup.find_all('entry')
        results = []
        one_year_ago = datetime.now() - timedelta(days=365)
        found = False
        for entry in entries:
            filing_date = entry.find('updated').text[:10]  # Extract the date
            filing_date = dateutil.parser.parse(filing_date)  # Use dateutil.parser to safely parse the date
            if filing_date > one_year_ago:
                found = True
                doc_link = entry.find('link')['href']
                ex21_link = fetch_exhibit_21_link(doc_link, headers)
                subsidiaries, subsidiaries_list = count_subsidiaries(ex21_link, headers) if ex21_link else (0, [])
                # Also fetch 8-K related data
                recent_8k_link, recent_8k_excerpt = fetch_recent_8k(cik, headers)
                results.append({
                    'ticker': ticker,
                    'cik': cik.zfill(10),
                    'company_name': company_name,
                    'sic_code': sic_code,
                    'filing_date': filing_date.date(),
                    'doc_link': doc_link,
                    'ex21_link': ex21_link or 'Not Found',
                    'num_subsidiaries': subsidiaries,
                    'subsidiaries_list': subsidiaries_list,
                    'sic_description': sic_desc,
                    'recent_8k_link': recent_8k_link,
                    'recent_8k_excerpt': recent_8k_excerpt
                })
        if not found:
            print(f"No recent 10-K filings found for Ticker: {ticker}, CIK: {cik}")
            results.append({
                'ticker': ticker,
                'cik': cik.zfill(10),
                'company_name': company_name,
                'sic_code': sic_code,
                'filing_date': 'No recent filings',
                'doc_link': 'Not Found',
                'ex21_link': 'Not Found',
                'num_subsidiaries': 'Not Found',
                'subsidiaries_list': 'Not Found',
                'sic_description': sic_desc,
                'recent_8k_link': 'Not Found',
                'recent_8k_excerpt': 'Not Found'
            })
        return results
    else:
        print(f"Failed to fetch data for Ticker: {ticker}, CIK: {cik}, HTTP status: {response.status_code}")
        return []
def fetch_recent_8k(cik, headers):
    # Build the URL for the search page for the recent 8-K with 5.02 and "chief executive"
    search_url = f"https://www.sec.gov/edgar/search/#/q=5.02%2520%2522chief%2520executive%2522&category=custom&ciks={cik.zfill(10)}&forms=8-K&sort=desc"
    print(f"Searching recent 8-K for CIK {cik.zfill(10)} at {search_url}")
    response = requests.get(search_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        table_body = soup.find('tbody')
        if table_body:
            first_row = table_body.find('tr')
            if first_row:
                columns = first_row.find_all('td')
                if len(columns) > 0:
                    filing_link = columns[1].find('a')['href']  # Assume filing link is in the second column
                    excerpt = columns[2].get_text(strip=True)[:100]  # First 100 characters of the document text
                    full_filing_link = f"https://www.sec.gov{filing_link}" if not filing_link.startswith('http') else filing_link
                    print(f"Found recent 8-K filing link: {full_filing_link}")
                    return full_filing_link, excerpt
    print("No recent 8-K filings found for 5.02 and 'chief executive'.")
    return 'Not Found', 'Not Found'
def fetch_exhibit_21_link(doc_link, headers):
    print(f"Fetching document page from {doc_link}")
    response = requests.get(doc_link, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        for a in soup.find_all('a', href=True):
            if 'ex21' in a['href'].lower() or 'exhibit21' in a['href'].lower() or \
                    'exx21' in a['href'].lower() or 'exh-21' in a['href'].lower() or \
                    'listofsubsid' in a['href'].lower() or 'ex-211' in a['href'].lower() or \
                    'exhibit 21' in a.text.lower():
                href = a['href']
                if not href.startswith('http'):
                    href = f"https://www.sec.gov{href}"
                print(f"Exhibit 21 link found: {href}")
                return href
        print("Exhibit 21 link not found in this document.")
    else:
        print(f"Failed to fetch document page: HTTP {response.status_code}")
    return None
def count_subsidiaries(ex21_link, headers):
    if ex21_link != "Not Found":
        response = requests.get(ex21_link, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            rows = soup.find_all('tr')
            subsidiaries_list = [row.get_text(strip=True) for row in rows if row.find('td')]
            print(f"Counted {len(subsidiaries_list)} subsidiaries.")
            return len(subsidiaries_list), subsidiaries_list
    print("No subsidiaries found or invalid link.")
    return 0, []
def read_ticker_and_ciks_from_csv(file_path):
    with open(file_path, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # Skip header
        return [(row[0], row[1]) for row in csvreader if row]
# File paths for CIKs
input_file_path = "C:\\Users\\charl\\OneDrive - Planck LLC d b a Patch Media\\SEC Data\\sampleciks.csv"
output_file_path = os.path.join(base_path, get_unique_filename(base_path, original_filename))
# Print the output file path to ensure correctness
print(f"Writing results to: {output_file_path}")
ticker_cik_list = read_ticker_and_ciks_from_csv(input_file_path)
user_agent = 'PlanckMedia SECDataScraper/1.0 (charles@haleglobal.com)'
# Max characters allowed in an Excel cell
max_chars = 32767
try:
    # Open CSV in write mode and write header once
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()  # Write header at the start of the file
        # Iterate through CIKs and write results incrementally
        for ticker, cik in ticker_cik_list:
            results = get_recent_10k_links(ticker, cik, user_agent)
            if not results:
                print(f"No results found for Ticker: {ticker}, CIK: {cik.zfill(10)}")
                results = [
                    {'ticker': ticker, 'cik': cik.zfill(10), 'company_name': 'Not Found', 'sic_code': 'Not Found',
                     'filing_date': 'No filings found', 'doc_link': 'Not Found',
                     'ex21_link': 'Not Found', 'num_subsidiaries': 'Not Found',
                     'subsidiaries_list': 'Not Found', 'recent_8k_link': 'Not Found',
                     'recent_8k_excerpt': 'Not Found'}]
            for result in results:
                if isinstance(result['subsidiaries_list'], list) and len(result['subsidiaries_list']) > max_chars:
                    result['subsidiaries_list'] = result['subsidiaries_list'][:max_chars]
                writer.writerow(result)  # Write row after processing each CIK
    print(f"Results successfully written to {output_file_path}")
except Exception as e:
    print(f"An error occurred: {e}")
print("Script completed and results are written to the file.")
But how do I get Power BI to run the script in Power Query? First, I imported the table that tells Python which CIKs to scrape as "sampleciks" in Power Query.
The tricky part is that I can't figure out how to get a new query to launch Python. Is it like R, such that it would look like this? If so, what is the command to run a Python script? OpenAI gives ridiculous answers that make no sense.
let
    Source = sampleciks,
    // Running an R script PASTE R SCRIPT FROM VISUAL IN HERE
    RunPScript = P.Execute("
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import dateutil.parser

# Function to fetch recent 10-K filings from SEC
def get_recent_10k_links(cik, user_agent):
    base_url = 'https://www.sec.gov/cgi-bin/browse-edgar'
    headers = {'User-Agent': user_agent}
    params = {
        'action': 'getcompany',
        'CIK': cik.zfill(10),  # Ensure CIK is zero-padded to 10 digits
        'type': '10-K',
        'owner': 'exclude',
        'output': 'atom',
        'count': 10,
    }
    # Send a GET request to the SEC site
    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'xml')
        # Get company details
        company_name_tag = soup.find('conformed-name')
        company_name = company_name_tag.get_text(strip=True) if company_name_tag else 'Not Found'
        sic_code_tag = soup.find('assigned-sic')
        sic_code = sic_code_tag.get_text(strip=True) if sic_code_tag else 'Not Found'
        sic_desc_tag = soup.find('assigned-sic-desc')
        sic_desc = sic_desc_tag.get_text(strip=True) if sic_desc_tag else 'Not Found'
        # Extract filings
        entries = soup.find_all('entry')
        results = []
        one_year_ago = datetime.now() - timedelta(days=365)
        for entry in entries:
            filing_date = entry.find('updated').text[:10]
            filing_date = dateutil.parser.parse(filing_date)
            if filing_date > one_year_ago:
                doc_link = entry.find('link')['href']
                results.append({
                    'cik': cik.zfill(10),
                    'company_name': company_name,
                    'sic_code': sic_code,
                    'filing_date': filing_date.date(),
                    'doc_link': doc_link,
                    'sic_description': sic_desc
                })
        return results
    else:
        return [{'cik': cik, 'company_name': 'Not Found', 'sic_code': 'Not Found', 'filing_date': 'No recent filings', 'doc_link': 'Not Found', 'sic_description': 'Not Found'}]

# Set user agent
user_agent = 'PlanckMedia SECDataScraper/1.0 (charles@haleglobal.com)'
# Initialize empty list for results
results_list = []
# Iterate through each row of the 'dataset' (which is automatically passed from Power Query)
for index, row in dataset.iterrows():
    cik = str(row['cik'])  # Extract CIK from the row
    results = get_recent_10k_links(cik, user_agent)  # Call the SEC fetching function
    for result in results:
        results_list.append(result)
", [dataset=#"Replaced Value1"]),
    // Extract the output table from the list
    OutputTable = RunPScript{0}[Value]
in
    OutputTable
Hey @charleshale ,
if the BeautifulSoup package is not supported by the Power BI Service, then there is a problem.
Maybe it's possible to create a Python app running in Azure, or an Azure Function. That app or function could do the data extraction and create a JSON document, a CSV, or a Parquet file that is then consumed by Power BI.
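Just as a rough sketch of that idea (not a finished implementation): something like the script below could run on a schedule outside Power BI and write the file. The module name sec_scraper is only a placeholder for the script you already posted; adjust the paths to your environment.

import pandas as pd
# Placeholder import: assumes the functions from your script live in a module called sec_scraper
from sec_scraper import get_recent_10k_links, read_ticker_and_ciks_from_csv

USER_AGENT = 'PlanckMedia SECDataScraper/1.0 (charles@haleglobal.com)'

def export_for_power_bi(cik_csv_path, output_path):
    # Run the existing scraper for every ticker/CIK pair and collect the result rows
    rows = []
    for ticker, cik in read_ticker_and_ciks_from_csv(cik_csv_path):
        rows.extend(get_recent_10k_links(ticker, cik, USER_AGENT))
    # Hand Power BI a flat file instead of running Python inside Power Query
    df = pd.DataFrame(rows)
    df.to_csv(output_path, index=False)  # df.to_parquet(...) also works if pyarrow is installed
    return df

if __name__ == '__main__':
    export_for_power_bi('sampleciks.csv', 'sec_results.csv')

Power BI then just imports sec_results.csv (or reads it from blob storage), so no Python packages are needed at refresh time.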
Hopefully, this provides an idea of how to tackle your challenge.
Regards,
Tom
Hey @charleshale ,
check this document: https://learn.microsoft.com/en-us/power-bi/connect-data/desktop-python-scripts
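To make that concrete: in the Power Query Editor you select your sampleciks query and use Transform > Run Python script; Power Query then wraps whatever you type into a Python.Execute step (that is the function your P.Execute guess was reaching for). A minimal sketch of the script body for such a step, assuming your CIK column is literally named 'cik' (adjust if not):

# 'dataset' is the pandas DataFrame Power Query passes into the step (here: sampleciks)
# Any DataFrame left in scope shows up as a table you can expand in the next step
import pandas as pd

output = dataset.copy()
output['cik_padded'] = output['cik'].astype(str).str.zfill(10)  # assumes the column is named 'cik'

In Desktop this runs against whatever environment is set under Options > Python scripting, so packages such as requests or beautifulsoup4 work there as long as they are installed in that environment.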
Regards,
Tom
Thank you, @TomMartens. It looks, then, like Power BI only supports pandas, so the issue is that a BeautifulSoup data harvest isn't supported. Does that sound right?