09-12-2025 10:10 AM
# Step 0: Configure Spark to treat table and column names as case-sensitive
spark.conf.set('spark.sql.caseSensitive', True)
# Step 1: Define SharePoint site and document library details
TENANT_HOST = "overdax.sharepoint.com"  # SharePoint host
SITE_PATH = "/sites/Sandbox"  # Server-relative site path
DOCS_DRIVE = "Documents"  # Document library (drive) name
ROOT_FOLDER = None  # Optional start subfolder
TARGET_LANDING_PATH = "Files/Landing"  # Lakehouse landing path
# Step 2: Retrieve secrets from Azure Key Vault
KEY_VAULT = "https://pezzott.vault.azure.net/"
CLIENT_ID = notebookutils.credentials.getSecret(KEY_VAULT, "graph-client-id")
TENANT_ID = notebookutils.credentials.getSecret(KEY_VAULT, "graph-tenant-id")
CLIENT_SECRET = notebookutils.credentials.getSecret(KEY_VAULT, "graph-secret")
# Step 3: Acquire access token for Microsoft Graph API
import requests
from urllib.parse import quote  # needed later when ROOT_FOLDER is set

token_url = f"https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token"
token_resp = requests.post(
    token_url,
    data={
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
        "scope": "https://graph.microsoft.com/.default",  # Graph app permissions
    },
)
token_resp.raise_for_status()
access_token = token_resp.json()["access_token"]
headers = {"Authorization": f"Bearer {access_token}", "Accept": "application/json"}
# Step 4: Locate the SharePoint document library by name
site = requests.get(
    f"https://graph.microsoft.com/v1.0/sites/{TENANT_HOST}:{SITE_PATH}",
    headers=headers,
).json()
site_id = site["id"]
drives = requests.get(
    f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives?$select=id,name,driveType",
    headers=headers,
).json()
drive_id = next(d["id"] for d in drives.get("value", []) if d["name"] == DOCS_DRIVE)
print(f"Drive ID: {drive_id}")
# Step 5: List all files in the SharePoint document library (recursive traversal)
def list_children(drive_id, item_id):
    url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}/children?$top=200"
    while url:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        data = r.json()
        for it in data.get("value", []):
            yield it
        url = data.get("@odata.nextLink")
# Resolve the root of the SharePoint drive
root = requests.get(f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root", headers=headers).json()
root_id = root["id"]

# If a start subfolder is specified, resolve it first
if ROOT_FOLDER:
    path = ROOT_FOLDER.strip("/")
    folder_item = requests.get(
        f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{quote(path)}",
        headers=headers,
    ).json()
    base_id = folder_item["id"]
else:
    base_id = root_id
# Traverse all folders and collect every file (no file extension filter)
all_items = []
stack = [base_id]
while stack:
    current = stack.pop()
    for it in list_children(drive_id, current):
        if "folder" in it:
            stack.append(it["id"])
        elif "file" in it:
            all_items.append(it)
print(f"Found {len(all_items)} files.")
# Utility functions for Lakehouse operations
import os, uuid, pathlib

def ensure_dir(dir_path: str):
    if dir_path and not notebookutils.fs.exists(dir_path):
        notebookutils.fs.mkdirs(dir_path)

def save_binary_to_lakehouse(target_rel_path: str, content: bytes, overwrite: bool = True):
    """
    Save a file to the Lakehouse by copying it from a local temp path.
    """
    tmp_dir = "/tmp"
    os.makedirs(tmp_dir, exist_ok=True)
    tmp_path = os.path.join(tmp_dir, f"{uuid.uuid4()}_{pathlib.Path(target_rel_path).name}")
    with open(tmp_path, "wb") as f:
        f.write(content)
    parent = "/".join(target_rel_path.split("/")[:-1])
    ensure_dir(parent)
    if overwrite and notebookutils.fs.exists(target_rel_path):
        notebookutils.fs.rm(target_rel_path)
    notebookutils.fs.cp(f"file:{tmp_path}", target_rel_path)
    try:
        os.remove(tmp_path)
    except Exception:
        pass
def relative_parent_from_item(item, root_folder: str | None = None) -> str:
    """
    Convert parentReference.path to a short relative path.
    Strips any '.../root:' prefix (e.g. '/drive/root:' or '/drives/<id>/root:')
    and, if defined, removes ROOT_FOLDER from the start as well.
    """
    p = item.get("parentReference", {}).get("path", "")
    # Possible examples:
    #   '/drive/root:/Test/Sub'
    #   '/drives/b!abc123/root:/Test/Sub'
    #   '/drives/b!abc123/root:' (file directly at the root)
    if "root:" in p:
        # Take everything after 'root:'
        p = p.split("root:", 1)[1]  # -> '/Test/Sub' or '' (if at the root)
    # Trim leading ':' and slashes
    p = p.lstrip(":").lstrip("/")  # -> 'Test/Sub' or ''
    # Remove ROOT_FOLDER from the start, if defined
    if root_folder:
        rf = root_folder.strip("/")
        # Case-insensitive comparison
        if p.lower() == rf.lower():
            p = ""
        elif p.lower().startswith(rf.lower() + "/"):
            p = p[len(rf) + 1:]
    return p
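# Quick sanity check (the sample item is shaped like a Graph driveItem; the
# values are illustrative only):
sample = {"parentReference": {"path": "/drives/b!abc123/root:/Test/Sub"}}
print(relative_parent_from_item(sample))          # -> 'Test/Sub'
print(relative_parent_from_item(sample, "Test"))  # -> 'Sub'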
# Step 6: Download files from SharePoint and land them into the Lakehouse
landed_paths = []
for item in all_items:
    # Prefer the pre-authenticated downloadUrl; otherwise fall back to /content
    download_url = item.get("@microsoft.graph.downloadUrl")
    if download_url:
        resp = requests.get(download_url)
        resp.raise_for_status()
        content = resp.content
    else:
        content_url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item['id']}/content"
        resp = requests.get(content_url, headers=headers)
        resp.raise_for_status()
        content = resp.content
    # Short path under Files/Landing (without the '/drives/<id>/root:' prefix)
    relative_parent = relative_parent_from_item(item, ROOT_FOLDER)
    if relative_parent:
        rel = f"{TARGET_LANDING_PATH}/{relative_parent}/{item['name']}"
    else:
        rel = f"{TARGET_LANDING_PATH}/{item['name']}"
    save_binary_to_lakehouse(rel, content, overwrite=True)
    landed_paths.append(rel)
print(f"Landed {len(landed_paths)} files into Lakehouse.")

Full notebook: https://github.com/alisonpezzott/useful-snippets/blob/main/fabric_notebooks/ingest_sharepoint_docs.ipynb
This is really good, thanks a lot for posting!
I had to make a couple of changes; review below.
Would also be great to add a flag to set whether you want to recurse through subfolders or not, e.g. along these lines:
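A sketch only (RECURSE is a made-up flag name); with recursion off, only the base folder's files are collected:

RECURSE = True  # set False to ingest only the top-level folder
all_items = []
stack = [base_id]
while stack:
    current = stack.pop()
    for it in list_children(drive_id, current):
        if "folder" in it and RECURSE:
            stack.append(it["id"])
        elif "file" in it:
            all_items.append(it)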
(a) When ROOT_FOLDER is set I had to include this import to stop the error about quote: "from urllib.parse import quote"
(b) I found that when SITE_PATH was set it was ignored and it always just used the default site. So I updated Step 4 to be:
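Something like the following, using Graph's path-based site addressing (the "/sites/" segment is an assumption about where the site lives; this may not be the exact change):

SITE_PATH = "/sites/Sandbox"  # server-relative path, not just the site name
site = requests.get(
    f"https://graph.microsoft.com/v1.0/sites/{TENANT_HOST}:{SITE_PATH}",
    headers=headers,
).json()
site_id = site["id"]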
Thanks! I had to tweak it a bit to have it find my files, but worked like a charm!