Re: Ingest all docs from SharePoint to Lakehouse

alison_pezzott · ‎09-12-2025

This notebook demonstrates the ingestion of all docs from a SharePoint site into Landing layer in the Lakehouse.

I used a Service Principal for authentication and followed these steps:

Create an App Registration in Microsoft Entra
In API permissions, grant the Microsoft Graph permissions Sites.Read.All and Sites.Selected, and ensure consent is given
Create the secret and configure it in Key Vault

# Step 0: Configure Spark to treat table and column names as case-sensitive
spark.conf.set('spark.sql.caseSensitive', True)

# Step 1: Define SharePoint site and document library details
TENANT_HOST         = "overdax.sharepoint.com"  # SharePoint Host
SITE_PATH           = "Sandbox"                 # Site path
DOCS_DRIVE          = "Documents"               # Documents folder path name
ROOT_FOLDER         = None                      # Optional start subfolder
TARGET_LANDING_PATH = "Files/Landing"           # Lakehouse landing path

# Step 2: Retrieve secrets from Azure Key Vault
KEY_VAULT     = "https://pezzott.vault.azure.net/"
CLIENT_ID     = notebookutils.credentials.getSecret(KEY_VAULT, "graph-client-id")
TENANT_ID     = notebookutils.credentials.getSecret(KEY_VAULT, "graph-tenant-id")
CLIENT_SECRET = notebookutils.credentials.getSecret(KEY_VAULT, "graph-secret")

# Step 3: Acquire access token for Microsoft Graph API
import requests

token_url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
token_resp = requests.post(
    token_url,
    data={
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
        "scope": "https://graph.microsoft.com/.default",  # Graph app perms
    },
)
token_resp.raise_for_status()
access_token = token_resp.json()["access_token"]
headers = {"Authorization": f"Bearer {access_token}", "Accept": "application/json"}

# Step 4: Locate SharePoint document library by name
site = requests.get(
    f"https://graph.microsoft.com/v1.0/sites/{TENANT_HOST}:{SITE_PATH}",
    headers=headers
).json()
site_id = site["id"]

drives = requests.get(f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives?$select=id,name,driveType", headers=headers).json()
drive_id = next(d["id"] for d in drives.get("value", []) if d["name"] == DOCS_DRIVE)
print(f"Drive ID: {drive_id}")

# Step 5: List all files in the SharePoint document library (recursive traversal)
def list_children(drive_id, item_id):
    url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}/children?$top=200"
    while url:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        data = r.json()
        for it in data.get("value", []):
            yield it
        url = data.get("@odata.nextLink")

# Resolve the root of the SharePoint drive
root = requests.get(f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root", headers=headers).json()
root_id = root["id"]

# If a root folder is specified, resolve it first
if ROOT_FOLDER:
    path = f"/{ROOT_FOLDER}".strip("/")
    folder_item = requests.get(
        f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{quote(path)}",
        headers=headers
    ).json()
    base_id = folder_item["id"]
else:
    base_id = root_id

# Traverse all folders and collect every file (no file extension filter)
all_items = []
stack = [base_id]
while stack:
    current = stack.pop()
    for it in list_children(drive_id, current):
        if "folder" in it:
            stack.append(it["id"])
        elif "file" in it:
            all_items.append(it)

print(f"Found {len(all_items)} files.")

# Utility functions for Lakehouse operations
import os, uuid, pathlib

def ensure_dir(dir_path: str):
    if dir_path and not notebookutils.fs.exists(dir_path):
        notebookutils.fs.mkdirs(dir_path)

def save_binary_to_lakehouse(target_rel_path: str, content: bytes, overwrite: bool = True):
    """
    Save files to lakehouse coping from a temp path.
    """
    tmp_dir = "/tmp"
    os.makedirs(tmp_dir, exist_ok=True)
    tmp_path = os.path.join(tmp_dir, f"{uuid.uuid4()}_{pathlib.Path(target_rel_path).name}")
    with open(tmp_path, "wb") as f:
        f.write(content)

    parent = "/".join(target_rel_path.split("/")[:-1])
    ensure_dir(parent)

    if overwrite and notebookutils.fs.exists(target_rel_path):
        notebookutils.fs.rm(target_rel_path)

    notebookutils.fs.cp(f"file:{tmp_path}", target_rel_path)

    try:
        os.remove(tmp_path)
    except Exception:
        pass

def relative_parent_from_item(item, root_folder: str | None = None) -> str:
    """
    Convert parentReference.path to short relative path.
    Removes any prefix '.../root:' (ex.: '/drive/root:' OU '/drives//root:')
    and, if exists, removes the ROOT_FOLDER from the start too.
    """
    p = item.get("parentReference", {}).get("path", "")
    # Possible examples: 
    # '/drive/root:/Test/Sub'
    # '/drives/b!abc123/root:/Test/Sub'
    # '/drives/b!abc123/root:'  (arquivo diretamente na raiz)

    if "root:" in p:
        # Take all after 'root:'
        p = p.split("root:", 1)[1]   # -> ': /Test/Sub' or '' (if root)

    # Clean initial ':' and slashes
    p = p.lstrip(":").lstrip("/")    # -> 'Test/Sub' or ''

    # Remove ROOT_FOLDER from start, if defined
    if root_folder:
        rf = root_folder.strip("/")
        # Case-insensitive comparision
        if p.lower() == rf.lower():
            p = ""
        elif p.lower().startswith(rf.lower() + "/"):
            p = p[len(rf) + 1 :]

    return p

# Step 6: Download files from SharePoint and land them into the Lakehouse
landed_paths = []
for item in all_items:
    # Try downloadUrl (pre-assigned); else use content
    download_url = item.get("@microsoft.graph.downloadUrl")
    if download_url:
        resp = requests.get(download_url)
        resp.raise_for_status()
        content = resp.content
    else:
        content_url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item['id']}/content"
        resp = requests.get(content_url, headers=headers)
        resp.raise_for_status()
        content = resp.content

    # Short path on Files/Landing (without no drives//root:)
    relative_parent = relative_parent_from_item(item, ROOT_FOLDER)
    if relative_parent:
        rel = f"{TARGET_LANDING_PATH}/{relative_parent}/{item['name']}"
    else:
        rel = f"{TARGET_LANDING_PATH}/{item['name']}"
    save_binary_to_lakehouse(rel, content, overwrite=True)
    landed_paths.append(rel)

print(f"Landed {len(landed_paths)} files into Lakehouse.")

https%3A%2F%2Fgithub.com%2Falisonpezzott%2Fuseful-snippets%2Fblob%2Fmain%2Ffabric_notebooks%2Fingest_sharepoint_docs.ipynb

garddolau · ‎10-21-2025

This is really good, thanks a lot for posting...

I had to make a couple of changes to review below.

Would also be great to add a flag to set if you wanted to recurse though sub folders for not.

(a) When ROOT_FOLDER is set I had to include this import to stop error about quote: "from urllib.parse import quote"

(b) I found that when SITE_PATH was set it was ignored and it allways just used the default site. So I updated Step 4 to be:

if SITE_PATH:

site_url = f"https://graph.microsoft.com/v1.0/sites/{TENANT_HOST}:/sites/{SITE_PATH}"

else:

site_url = f"https://graph.microsoft.com/v1.0/sites/{TENANT_HOST}:{SITE_PATH}"

site = requests.get(

site_url,

headers=headers

).json()

MicheleParisi · ‎10-03-2025

Thanks! I had to tweak it a bit to have it find my files, but worked like a charm!

FabCon is coming to Atlanta

Re: Ingest all docs from SharePoint to Lakehouse

Ingest all docs from SharePoint to Lakehouse