This web scraper is a little odd because it had to run within a very constrained environment.
Essentially, it attempts to load URLs made up of a fixed prefix and a sequential numerical suffix, in the format:
https://url/browse/JIRA-number
then prints each page to PDF via the default printer and downloads a zip of its attachments from the URL found in the page’s ‘aszip’ element.
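Condensed, the per-issue flow looks like the sketch below (names match the full script that follows; the PDF wait/rename handling and throttling are omitted here, and a single IE instance is reused rather than one per issue):

# Sketch of the core per-issue flow only; see the full script below.
$ie = New-Object -ComObject InternetExplorer.Application
$ie.Visible = $true
foreach ($n in 1..300) {
    $ie.Navigate2('https://url/browse/JIRA-' + $n)    # prefix + sequential suffix
    Start-Sleep -Seconds 10                            # crude wait for the page to render
    $ie.ExecWB(6, 2)                                   # silent print to the default (PDF) printer
    $zip = $ie.Document.getElementById('aszip').href   # 'download all attachments' link
    if ($zip) { Start-Process 'chrome.exe' $zip }      # Chrome downloads without prompting
}
$ie.Quit()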
# JIRA Archive v3.0
# For archiving JIRA entries and zip attachments
# It's a quick and dirty web scraper
# Not the most ideal way I would usually do this, but this one needed
# to work with several corporate SOE and security restrictions

# SYNOPSIS -------------------------------------------------
# Needed a way to download and archive all JIRA items and attachments.
# The JIRA setup does not allow bulk extraction or archiving.
# The solution has to operate with only the same access as a non-admin client user.

# CHALLENGES -----------------------------------------------
# Corporate SOE prevents IE from auto-downloading without user prompting; Chrome however allows it.
# Invoke-WebRequest doesn't work (PowerShell is only v2, possibly other causes of this).
# Resulting archive needs to be seen to maintain integrity of source without tampering (as much as possible)
# (hence print to PDF rather than save as HTML).
# Urgency.

# Set the target JIRA number range
$from_JIRA = 1
$to_JIRA = 300

# Throttle the time between downloads
$throttle = 5 # in seconds

# -------------- Functions

# TestFileLock: returns $true while another process still holds the file open
function TestFileLock {
    param (
        [parameter(Mandatory = $true)][string]$Path
    )
    if ((Test-Path -Path $Path) -eq $false) {
        return $false
    }
    $oFile = New-Object System.IO.FileInfo $Path
    try {
        $oStream = $oFile.Open([System.IO.FileMode]::Open, [System.IO.FileAccess]::ReadWrite, [System.IO.FileShare]::None)
        if ($oStream) {
            $oStream.Close()
        }
        return $false
    }
    catch {
        # File is locked by another process.
        return $true
    }
}

# -------------------------------------------------
# Main loop
# -le so the final issue number ($to_JIRA) is included in the range
for ($current_JIRA = $from_JIRA; $current_JIRA -le $to_JIRA; $current_JIRA++) {
    Write-Host ('Processing JIRA-' + $current_JIRA)
    $website = ('https://url/browse/JIRA-' + $current_JIRA)
    $website  # echo the target URL

    # Using the IE COM object because Invoke-WebRequest doesn't work here
    $ie = New-Object -ComObject InternetExplorer.Application
    $ie.Visible = $true
    $ie.Navigate2($website)

    # Give the website a chance to load (note: might replace this with a page load return code check)
    Start-Sleep -Seconds 10

    # Print to the default printer, which is set to PDFCreator.
    # ExecWB(6,2) = OLECMDID_PRINT with OLECMDEXECOPT_DONTPROMPTUSER (no print dialog).
    $ie.ExecWB(6, 2)

    # Scrape the 'download all attachments' link URL from the element id
    $aszipurl = $ie.Document.getElementById('aszip').href
    Write-Host "attachments zip url: " $aszipurl
    if ($aszipurl -ne $null) {
        # If a zip link exists, pass it to Chrome because IE can't be made to download without prompting
        Start-Process "chrome.exe" $aszipurl
    }

    # Wait for the PDF to exist, then rename it
    $testpath = 'C:\Users\user\Downloads\current.pdf'
    while ((Test-Path $testpath) -ne $true) {
        Write-Host "Waiting for pdf to exist..."
        Start-Sleep -Seconds 5
    }
    $newpath = 'C:\Users\user\Downloads\Named\JIRA_' + $current_JIRA + '.pdf'

    # Make sure the PDF has finished writing first
    while (TestFileLock -Path $testpath) {
        Write-Host "Waiting for pdf to be complete ..."
        Start-Sleep -Seconds 5
    }

    # Now move the file
    Write-Host "Moving PDF"
    Move-Item -Path $testpath -Destination $newpath

    # Close the IE instance
    $ie.Quit()

    Write-Host "Sleeping " $throttle
    Start-Sleep -Seconds $throttle
}
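The fixed 10-second sleep is flagged in the script itself as something to replace with a proper page-load check. A minimal sketch of that check, assuming the same IE COM object, polls the Busy and ReadyState properties (4 = READYSTATE_COMPLETE) with a timeout so a hung page cannot stall the whole run:

# Sketch only: wait for the IE COM object to finish loading, with a timeout.
# Would replace the fixed 'Start-Sleep -Seconds 10' in the main loop.
function WaitForPageLoad {
    param (
        [parameter(Mandatory = $true)]$Ie,
        [int]$TimeoutSeconds = 60
    )
    $elapsed = 0
    while (($Ie.Busy -or $Ie.ReadyState -ne 4) -and ($elapsed -lt $TimeoutSeconds)) {
        Start-Sleep -Seconds 1
        $elapsed++
    }
    return (-not $Ie.Busy -and $Ie.ReadyState -eq 4)   # $true if the page finished loading
}

Even with the readiness check, a short extra sleep before ExecWB can still help, since ReadyState reports document load rather than any JavaScript-rendered parts of the JIRA page.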