There isn’t currently a simple way of listing all the files in your Azure Data Lake Storage Gen2 account. You can view them using Azure Storage Explorer, but capturing a list isn’t straightforward.
The storage service REST API can be invoked using PowerShell to list files, but it can be a tricky beast to work with. To throttle the API, each call returns at most 5000 objects. If there are more than that, a continuation token is also returned; you pass this token to a subsequent call to list the next 5000 files, and repeat until all files have been listed.
Feel free to use this; it’s licensed under the GPL (version 3 or later).
<#
Copyright (c) 2019 Michael Cameron mcameron@dreich.net
Lists everything in an Azure Data Lake Storage Gen2 filesystem using the "Path - List" REST operation.
# Rest documentation:
# https://docs.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/list
# API:
# GET http://{accountName}.{dnsSuffix}/{filesystem}?directory={directory}&recursive={recursive}&continuation={continuation}&maxResults={maxResults}&upn={upn}&resource=filesystem&timeout={timeout}
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
#>
[CmdletBinding()]
Param(
    # Name of the storage account; used in the request host name and in the signature.
    [Parameter(Mandatory = $true, Position = 1)]
    [string] $StorageAccountName,

    # Name of the filesystem whose paths are listed.
    [Parameter(Mandatory = $true, Position = 2)]
    [string] $FilesystemName,

    # Base64-encoded storage account access key used to sign each request.
    [Parameter(Mandatory = $true, Position = 3)]
    [string] $AccessKey
)

# Suppress the status bar that Invoke-WebRequest would otherwise draw.
$progressPreference = 'silentlyContinue'
Function listPaths(){
    <#
    .SYNOPSIS
    Fetches one page of paths from the filesystem and writes one line per path.

    .DESCRIPTION
    Reads the script parameters $StorageAccountName, $FilesystemName and $AccessKey,
    and the continuation token held in $global:continuationToken. Builds a Shared Key
    signature, calls the REST API once, and writes
    "name, owner, group, permissions, lastModified" for each returned path.

    On return, $global:continuationToken holds the token for the next page, or an
    empty string when there is no further page or the request failed, so a caller
    looping "while ($global:continuationToken)" stops in both cases.
    #>
    $headerDate    = [System.DateTime]::UtcNow.ToString("R") # RFC1123 pattern, e.g. Mon, 15 Jun 2009 20:45:30 GMT
    $headerVersion = "2018-11-09"                            # use this version of the API
    $method        = "GET"                                   # GET has to be used to read the list of files
    $dnsSuffix     = "dfs.core.windows.net"
    $recursive     = "true"
    $myToken       = [string]$global:continuationToken       # a token that was never set becomes ""
    $maxResults    = ""                                      # left empty, as in the request URL below
    $resource      = "filesystem"

    # Set up an array of parameters for signing. The order of the entries matters:
    # the same string has to be reproduced by the service to validate the signature.
    # The optional directory, timeout and upn query parameters are not sent, so they
    # are not signed either.
    $signatureParams = @(
        $method,                                 # method, e.g. GET, PUT
        "",                                      # content encoding
        "",                                      # content language
        "",                                      # content length
        "",                                      # content MD5
        "",                                      # content type
        "",                                      # date (x-ms-date is used instead)
        "",                                      # if modified since
        "",                                      # if match
        "",                                      # if no match
        "",                                      # if unmodified since
        "",                                      # range
        "x-ms-date:$headerDate",                 # date
        "x-ms-version:$headerVersion",           # version
        "/$StorageAccountName/$FilesystemName",  # file system name
        "continuation:$myToken",                 # continuation token (unescaped here, escaped in the URL)
        "maxresults:$maxResults",                # maximum number of results (limit is 5000)
        "recursive:$recursive",                  # recursion flag
        "resource:$resource"                     # resource type, must be filesystem for storage service API
    )

    # create a string to sign by joining the array with newlines
    $stringToSign = $signatureParams -join "`n"

    # Sign the string with HMAC-SHA256 keyed by the access key; dispose the
    # hasher afterwards so it does not linger for every page.
    $hasher = New-Object System.Security.Cryptography.HMACSHA256
    try {
        $hasher.Key = [System.Convert]::FromBase64String($AccessKey)
        $signedSignature = [System.Convert]::ToBase64String($hasher.ComputeHash([System.Text.Encoding]::UTF8.GetBytes($stringToSign)))
    }
    finally {
        $hasher.Dispose()
    }
    $authHeader = "SharedKey ${StorageAccountName}:$signedSignature"

    # set up headers for calling API
    $headers = @{
        "x-ms-date"     = $headerDate
        "x-ms-version"  = $headerVersion
        "Authorization" = $authHeader
    }

    # define the API address
    $URI = "https://{0}.{1}/{2}?continuation={3}&maxresults={4}&recursive={5}&resource={6}" -f $StorageAccountName, $dnsSuffix, $FilesystemName, ([uri]::EscapeDataString($myToken)), $maxResults, $recursive, $resource

    # call the REST API
    try {
        # -UseBasicParsing avoids the Internet Explorer dependency on Windows PowerShell 5.1.
        $response = Invoke-WebRequest -Method $method -Uri $URI -Headers $headers -UseBasicParsing
    }
    catch {
        # Clear the token first so the caller's loop stops even if Write-Error
        # terminates under $ErrorActionPreference = 'Stop'.
        $global:continuationToken = ""
        Write-Error $_.Exception
        # $response is not assigned when the request throws; report the error
        # details attached to the error record instead, when there are any.
        if ($_.ErrorDetails -and $_.ErrorDetails.Message) {
            Write-Error $_.ErrorDetails.Message
        }
        return
    }

    # Header values may be a single string or a string array depending on the
    # PowerShell version; collapse to one string ("" when the header is absent).
    $global:continuationToken = @($response.headers."x-ms-continuation") -join ""

    foreach ($path in (ConvertFrom-Json $response.content).paths)
    {
        Write-Output (($path.name,$path.owner,$path.group, $path.permissions,$path.lastModified) -join ", ")
    }
}
# Start from the first page: a token left in the global scope by an earlier
# run in the same session would otherwise be sent with the first request.
$global:continuationToken = ""

# Fetch pages until the service stops returning a continuation token.
Do {
    listPaths
} while ($global:continuationToken)
Note: The syntax highlighter is being a bit too smart above; you may need to remove “amp;” from the URL if you copy and paste this.