PowerShell: JIRA Archiver v3

This web scraper is a little odd because it had to run within a very constrained environment.

Essentially, it attempts to load URLs consisting of a prefix and a sequential numerical suffix, in the format:

https://url/browse/JIRA-number

then prints the webpage to PDF via the default printer, and downloads a zip file from the URL found in each page’s ‘aszip’ element.

 

# JIRA Archive v3.0
# For archiving JIRA entries and zip attachments

# It's a quick and dirty web scraper.
# This is not how I would usually do it, but this one needed
#  to work within several corporate SOE and security restrictions.

# SYNOPSIS -------------------------------------------------

# 	Needed a way to download and archive all JIRA items and attachments.
#	The JIRA setup does not allow bulk extraction or archiving.
#       The solution has to operate with only the same access as a non-admin client user.

# CHALLENGES -----------------------------------------------

#	Corporate SOE prevents IE from auto downloading without user prompting, Chrome however allows it.
#	Invoke-WebRequest doesn't work (PowerShell is only v2; there may be other causes too)
#	Resulting archive needs to be seen to maintain integrity of source without tamper (as much as possible)
#	(Hence print to PDF rather than save as HTML)
#   Urgency.  
    
# Set the target JIRA number range: the main loop counts up from
# $from_JIRA towards $to_JIRA and builds one issue URL per number.
    $from_JIRA = 1
    $to_JIRA = 300

# Throttle: how long the main loop pauses after finishing each issue.
# (The variable keeps its original spelling because the main loop reads it.)

    $throtle = 5 # in seconds

# -------------- Functions

            # TestFileLock
            # Reports whether the file at $Path is currently locked.
            #   -Path : path of the file to probe (mandatory).
            # Emits $false when the file does not exist or can be opened with
            # exclusive read/write access; emits $true when that open attempt fails.
            function TestFileLock {
              param (
                [parameter(Mandatory=$true)][string]$Path
              )

              $target = New-Object -TypeName System.IO.FileInfo -ArgumentList $Path

              # A file that is not there cannot be locked.
              if (-not (Test-Path -Path $Path)) {
                return $false
              }

              # Assume the worst; only a clean exclusive open clears the flag.
              $isLocked = $true

              try {
                $mode   = [System.IO.FileMode]::Open
                $access = [System.IO.FileAccess]::ReadWrite
                $share  = [System.IO.FileShare]::None   # nobody else may hold the file

                $handle = $target.Open($mode, $access, $share)
                if ($null -ne $handle) {
                  $handle.Close()
                }

                $isLocked = $false
              } catch {
                # Could not get exclusive access: the file is locked by a process.
              }

              return $isLocked
            }

# -------------------------------------------------


#Main Loop
# For every issue number in the configured range:
#   1. open the issue page in Internet Explorer (COM automation),
#   2. print it to the default printer (expected to be PDFCreator),
#   3. hand the 'aszip' attachment link, if any, to Chrome for download,
#   4. wait for the printed PDF to appear and be released, then move it to
#      a per-issue file name.
# Reads $from_JIRA, $to_JIRA and $throtle, and calls TestFileLock.

# Where the PDF printer drops its output, and where the renamed copies go.
$testpath  = 'C:\Users\user\Downloads\current.pdf'
$named_dir = 'C:\Users\user\Downloads\Named'

# Move-Item does not create missing folders; without this every move would
# fail and the printed PDF would be left behind for the next issue.
if ((Test-Path -Path $named_dir) -ne $true) {
    New-Item -Path $named_dir -ItemType Directory | Out-Null
}

# -le (not -lt) so that issue number $to_JIRA itself is archived too.
for ($current_JIRA = $from_JIRA; $current_JIRA -le $to_JIRA; $current_JIRA++) {

    Write-Host ('Processing JIRA-' + $current_JIRA)

    $website = ("https://url/browse/JIRA-" + $current_JIRA)
    $website

    # Start every issue with no attachment link, so a failed lookup can never
    # re-use the link scraped from the previous issue.
    $aszipurl = $null

    #using ie com object because invoke-webrequest doesnt work
    $ie = New-Object -ComObject internetexplorer.application

    try {
        $ie.Visible = $true
        $ie.navigate2($website)

        #Give the website a chance to load (note: might replace this with a page load return code check)
        Start-Sleep -Seconds 10

        #Print to default printer which is set to PDFCreator
        # 6 = OLECMDID_PRINT, 2 = OLECMDEXECOPT_DONTPROMPTUSER
        $ie.execWB(6, 2)

        #Scrape the 'download all attachments' link url from the element id.
        # Each step is checked so a page without a document or without the
        # element simply leaves $aszipurl empty instead of raising an error.
        $document = $ie.Document
        if ($null -ne $document) {
            $aszip = $document.getElementById('aszip')
            if ($null -ne $aszip) {
                $aszipurl = $aszip.href
            }
        }
        Write-Host "attachments zip url: " $aszipurl

        if ($null -ne $aszipurl) {
            #if zip file exists pass it to chrome because cant get IE to download without prompting
            Start-Process "chrome.exe" $aszipurl
        }

        # wait for pdf to exist and then rename it
        while ((Test-Path $testpath) -ne $true) {
            Write-Host "Waiting for pdf to exist..."
            Start-Sleep -Seconds 5
        }

        $newpath = 'C:\Users\user\Downloads\Named\JIRA_' + $current_JIRA + '.pdf'

        #make sure the pdf has finished writing first
        while ((TestFileLock -Path $testpath) -eq $true) {
            Write-Host "Waiting for pdf to be complete ..."
            Start-Sleep -Seconds 5
        }

        #now move the file
        # -ErrorAction Stop: if the move fails, current.pdf stays in place and
        # the next issue would be saved under the wrong number, so abort the run.
        Write-Host "Moving PDF"
        Move-Item -Path $testpath -Destination $newpath -ErrorAction Stop
    }
    finally {
        #Close the ie instance, even when something above failed
        $ie.quit()
    }

    Write-Host "Sleeping " $throtle
    Start-Sleep -Seconds $throtle

}