Join us at FabCon Atlanta from March 16 - 20, 2026, for the ultimate Fabric, Power BI, AI and SQL community-led event. Save $200 with code FABCOMM.
Register now! Get Fabric Certified for FREE during Fabric Data Days. Don't miss your chance! Request now
10-03-2025 09:24 AM
# Step 0: Configure Spark to treat table and column names as case-sensitive
spark.conf.set("spark.sql.caseSensitive", True)

# Step 1: Define SharePoint site and document library details
TENANT_HOST = "overdax.sharepoint.com"   # SharePoint host
SITE_PATH = "Sandbox"                    # Site path
DOCS_DRIVE = "Documents"                 # Document library (drive) name
ROOT_FOLDER = None                       # Optional start subfolder
TARGET_LANDING_PATH = "Files/Landing"    # Lakehouse landing path

# Step 2: Retrieve secrets from Azure Key Vault
KEY_VAULT = "https://pezzott.vault.azure.net/"
CLIENT_ID = notebookutils.credentials.getSecret(KEY_VAULT, "graph-client-id")
TENANT_ID = notebookutils.credentials.getSecret(KEY_VAULT, "graph-tenant-id")
CLIENT_SECRET = notebookutils.credentials.getSecret(KEY_VAULT, "graph-secret")
# Step 3: Acquire an app-only access token for Microsoft Graph (client credentials flow)
import requests

token_url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
token_resp = requests.post(
    token_url,
    data={
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
        "scope": "https://graph.microsoft.com/.default",  # Graph app perms
    },
    timeout=30,  # fail fast instead of hanging forever if the endpoint is unreachable
)
token_resp.raise_for_status()
access_token = token_resp.json()["access_token"]
headers = {"Authorization": f"Bearer {access_token}", "Accept": "application/json"}
# Step 4: Locate the SharePoint site and its document library by name.
# Graph's site-by-path form is "sites/{hostname}:/{server-relative-path}" — the
# original "{host}:{path}" (no slash after the colon) silently resolved to the
# tenant's default site, ignoring SITE_PATH. Assumes the site lives under
# "/sites/{SITE_PATH}"; adjust the prefix if yours does not.
site_resp = requests.get(
    f"https://graph.microsoft.com/v1.0/sites/{TENANT_HOST}:/sites/{SITE_PATH}",
    headers=headers,
    timeout=30,
)
site_resp.raise_for_status()
site_id = site_resp.json()["id"]

drives_resp = requests.get(
    f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives?$select=id,name,driveType",
    headers=headers,
    timeout=30,
)
drives_resp.raise_for_status()
drives = drives_resp.json()
try:
    drive_id = next(d["id"] for d in drives.get("value", []) if d["name"] == DOCS_DRIVE)
except StopIteration:
    # A bare next() would raise an opaque StopIteration; name the real problem.
    raise ValueError(f"Document library {DOCS_DRIVE!r} not found on site {SITE_PATH!r}") from None
print(f"Drive ID: {drive_id}")
def list_children(drive_id, item_id):
    """Yield every direct child of a drive item, following @odata.nextLink pagination.

    Args:
        drive_id: Graph drive id of the document library.
        item_id: Graph item id of the folder whose children are listed.

    Yields:
        dict: one Graph driveItem per child (files and folders alike).
    """
    url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}/children?$top=200"
    while url:
        # timeout so a dead connection can't stall the whole traversal
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        data = r.json()
        yield from data.get("value", [])
        url = data.get("@odata.nextLink")  # None when the last page is reached
# Resolve the root of the SharePoint drive.
# quote() was used below without being imported, so any run with ROOT_FOLDER
# set failed with NameError (reported by users); import it here.
from urllib.parse import quote

root_resp = requests.get(
    f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root",
    headers=headers,
    timeout=30,
)
root_resp.raise_for_status()
root_id = root_resp.json()["id"]

# If a start subfolder is specified, resolve its item id first; else start at root.
if ROOT_FOLDER:
    path = f"/{ROOT_FOLDER}".strip("/")
    folder_resp = requests.get(
        f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{quote(path)}",
        headers=headers,
        timeout=30,
    )
    folder_resp.raise_for_status()
    base_id = folder_resp.json()["id"]
else:
    base_id = root_id
# Traverse every folder depth-first and collect all files (no extension filter).
all_items = []
pending = [base_id]
while pending:
    folder_id = pending.pop()
    for child in list_children(drive_id, folder_id):
        if "folder" in child:
            pending.append(child["id"])  # visit this subfolder later
        elif "file" in child:
            all_items.append(child)
print(f"Found {len(all_items)} files.")
# Utility functions for Lakehouse operations
import os, uuid, pathlib
def ensure_dir(dir_path: str):
    """Create the Lakehouse directory if a non-empty path was given and it is missing."""
    if not dir_path:
        return
    if not notebookutils.fs.exists(dir_path):
        notebookutils.fs.mkdirs(dir_path)
def save_binary_to_lakehouse(target_rel_path: str, content: bytes, overwrite: bool = True):
    """
    Save binary content to the Lakehouse by copying from a local temp file.

    notebookutils.fs has no direct byte-write API, so the payload is first
    written under /tmp and then copied into the Lakehouse path.

    Args:
        target_rel_path: Lakehouse-relative destination, e.g. "Files/Landing/a.pdf".
        content: Raw file bytes to persist.
        overwrite: When True, replace an existing file at the destination.
    """
    tmp_dir = "/tmp"
    os.makedirs(tmp_dir, exist_ok=True)
    # Unique prefix avoids collisions when several files share a basename.
    tmp_path = os.path.join(tmp_dir, f"{uuid.uuid4()}_{pathlib.Path(target_rel_path).name}")
    try:
        with open(tmp_path, "wb") as f:
            f.write(content)
        parent = "/".join(target_rel_path.split("/")[:-1])
        ensure_dir(parent)
        if overwrite and notebookutils.fs.exists(target_rel_path):
            notebookutils.fs.rm(target_rel_path)
        notebookutils.fs.cp(f"file:{tmp_path}", target_rel_path)
    finally:
        # Clean up the temp file even when the copy fails (the original only
        # removed it after a successful cp, leaking /tmp space on errors).
        try:
            os.remove(tmp_path)
        except OSError:
            pass
def relative_parent_from_item(item, root_folder: str | None = None) -> str:
"""
Convert parentReference.path to short relative path.
Removes any prefix '.../root:' (ex.: '/drive/root:' OU '/drives//root:')
and, if exists, removes the ROOT_FOLDER from the start too.
"""
p = item.get("parentReference", {}).get("path", "")
# Possible examples:
# '/drive/root:/Test/Sub'
# '/drives/b!abc123/root:/Test/Sub'
# '/drives/b!abc123/root:' (arquivo diretamente na raiz)
if "root:" in p:
# Take all after 'root:'
p = p.split("root:", 1)[1] # -> ': /Test/Sub' or '' (if root)
# Clean initial ':' and slashes
p = p.lstrip(":").lstrip("/") # -> 'Test/Sub' or ''
# Remove ROOT_FOLDER from start, if defined
if root_folder:
rf = root_folder.strip("/")
# Case-insensitive comparision
if p.lower() == rf.lower():
p = ""
elif p.lower().startswith(rf.lower() + "/"):
p = p[len(rf) + 1 :]
return p# Step 6: Download files from SharePoint and land them into the Lakehouse
# Download each collected file and land it under TARGET_LANDING_PATH,
# preserving the folder structure relative to the starting folder.
landed_paths = []
for item in all_items:
    # Prefer the pre-authenticated downloadUrl; fall back to the /content endpoint.
    download_url = item.get("@microsoft.graph.downloadUrl")
    if download_url:
        # Pre-signed URL: no Authorization header needed. Timeout guards against hangs.
        resp = requests.get(download_url, timeout=120)
    else:
        content_url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item['id']}/content"
        resp = requests.get(content_url, headers=headers, timeout=120)
    resp.raise_for_status()
    content = resp.content

    # Build the short landing path (without the '/drives/<id>/root:' prefix).
    relative_parent = relative_parent_from_item(item, ROOT_FOLDER)
    if relative_parent:
        rel = f"{TARGET_LANDING_PATH}/{relative_parent}/{item['name']}"
    else:
        rel = f"{TARGET_LANDING_PATH}/{item['name']}"

    save_binary_to_lakehouse(rel, content, overwrite=True)
    landed_paths.append(rel)

print(f"Landed {len(landed_paths)} files into Lakehouse.")
# Source notebook: https://github.com/alisonpezzott/useful-snippets/blob/main/fabric_notebooks/ingest_sharepoint_docs.ipynb
This is really good, thanks a lot for posting...
I had to make a couple of changes to review below.
It would also be great to add a flag to set whether you want to recurse through subfolders or not.
(a) When ROOT_FOLDER is set I had to include this import to stop error about quote: "from urllib.parse import quote"
(b) I found that when SITE_PATH was set it was ignored and it always just used the default site. So I updated Step 4 to be:
Thanks! I had to tweak it a bit to have it find my files, but worked like a charm!