701
General Software Discussion / Re: Extract REGEX matches from multiple text files
« Last post by 4wd on September 23, 2018, 03:39 AM »
Input
File 1: xml-test.xml
<?xml version="1.0" encoding="ISO8859-1" ?>
<html:products>
<html:prod id="prod1">
<html:referenceData>
<html:product>
<html:classificationType>PRD</html:classificationType>
<html:productType>PRD_XE</html:productType>
<html:productId>10004</html:productId>
<html:assignedDate>2018-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS</html:name>
<html:Entity>REP_XE</html:legalEntity>
<html:location>ED</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod2">
<html:referenceData>
<html:product>
<html:classificationType>PRD2</html:classificationType>
<html:productType>PRD_XE2</html:productType>
<html:productId>10005</html:productId>
<html:assignedDate>2018-12-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS2</html:name>
<html:Entity>REP_XE2</html:legalEntity>
<html:location>ED2</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod3">
<html:referenceData>
<html:product>
<html:classificationType>PRD</html:classificationType>
<html:productType>PRD_XE</html:productType>
<html:productId>10004</html:productId>
<html:assignedDate>2013-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS3</html:name>
<html:Entity>REP_XE3</html:legalEntity>
<html:location>ED3</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod1">
<html:referenceData>
<html:product>
<html:classificationType>PRD4</html:classificationType>
<html:productType>PRD_XE4</html:productType>
<html:productId>10567</html:productId>
<html:assignedDate>2010-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS4</html:name>
<html:Entity>REP_XE4</html:legalEntity>
<html:location>ED4</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod5">
<html:referenceData>
<html:product>
<html:classificationType>PRD5</html:classificationType>
<html:productType>PRD_XE5</html:productType>
<html:productId>10004890</html:productId>
<html:assignedDate>2015-05-15</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS5</html:name>
<html:Entity>REP_XE5</html:legalEntity>
<html:location>ED5</html:location>
</html:book>
</html:referenceData>
</html:prod>
</html:products>
File 2: xml test2.xml
<?xml version="1.0" encoding="ISO8859-1" ?>
<html:products>
<html:prod id="prod1">
<html:referenceData>
<html:product>
<html:classificationType>PRD</html:classificationType>
<html:productType>PRD_XE</html:productType>
<html:productId>10004</html:productId>
<html:assignedDate>2018-03-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REFUNDS</html:name>
<html:Entity>REP_XE</html:legalEntity>
<html:location>ED</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod2">
<html:referenceData>
<html:product>
<html:classificationType>PRD2</html:classificationType>
<html:productType>PRD_XE2</html:productType>
<html:productId>10005</html:productId>
<html:assignedDate>2015-12-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS2k12</html:name>
<html:Entity>REP_XE2</html:legalEntity>
<html:location>ED57</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod3">
<html:referenceData>
<html:product>
<html:classificationType>PRD4</html:classificationType>
<html:productType>PRD_XER3</html:productType>
<html:productId>10014</html:productId>
<html:assignedDate>2010-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>DESTRUCTION</html:name>
<html:Entity>REP_XE3</html:legalEntity>
<html:location>ED43</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod4">
<html:referenceData>
<html:product>
<html:classificationType>PRD4</html:classificationType>
<html:productType>PRD_XE4</html:productType>
<html:productId>10567</html:productId>
<html:assignedDate>1999-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>WHORU</html:name>
<html:Entity>REP_XS4</html:legalEntity>
<html:location>ED4</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod5">
<html:referenceData>
<html:product>
<html:classificationType>PRD5</html:classificationType>
<html:productType>PRD_XE5</html:productType>
<html:productId>10004890</html:productId>
<html:assignedDate>2115-12-15</html:assignedDate>
</html:product>
<html:book>
<html:name>SCREW_THIS</html:name>
<html:Entity>REP_XE5</html:legalEntity>
<html:location>ED5</html:location>
</html:book>
</html:referenceData>
</html:prod>
</html:products>
<?xml version="1.0" encoding="ISO8859-1" ?>
<html:products>
<html:prod id="prod1">
<html:referenceData>
<html:product>
<html:classificationType>PRD</html:classificationType>
<html:productType>PRD_XE</html:productType>
<html:productId>10004</html:productId>
<html:assignedDate>2018-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS</html:name>
<html:Entity>REP_XE</html:legalEntity>
<html:location>ED</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod2">
<html:referenceData>
<html:product>
<html:classificationType>PRD2</html:classificationType>
<html:productType>PRD_XE2</html:productType>
<html:productId>10005</html:productId>
<html:assignedDate>2018-12-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS2</html:name>
<html:Entity>REP_XE2</html:legalEntity>
<html:location>ED2</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod3">
<html:referenceData>
<html:product>
<html:classificationType>PRD</html:classificationType>
<html:productType>PRD_XE</html:productType>
<html:productId>10004</html:productId>
<html:assignedDate>2013-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS3</html:name>
<html:Entity>REP_XE3</html:legalEntity>
<html:location>ED3</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod1">
<html:referenceData>
<html:product>
<html:classificationType>PRD4</html:classificationType>
<html:productType>PRD_XE4</html:productType>
<html:productId>10567</html:productId>
<html:assignedDate>2010-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS4</html:name>
<html:Entity>REP_XE4</html:legalEntity>
<html:location>ED4</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod5">
<html:referenceData>
<html:product>
<html:classificationType>PRD5</html:classificationType>
<html:productType>PRD_XE5</html:productType>
<html:productId>10004890</html:productId>
<html:assignedDate>2015-05-15</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS5</html:name>
<html:Entity>REP_XE5</html:legalEntity>
<html:location>ED5</html:location>
</html:book>
</html:referenceData>
</html:prod>
</html:products>
File 2: xml test2.xml
<?xml version="1.0" encoding="ISO8859-1" ?>
<html:products>
<html:prod id="prod1">
<html:referenceData>
<html:product>
<html:classificationType>PRD</html:classificationType>
<html:productType>PRD_XE</html:productType>
<html:productId>10004</html:productId>
<html:assignedDate>2018-03-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REFUNDS</html:name>
<html:Entity>REP_XE</html:legalEntity>
<html:location>ED</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod2">
<html:referenceData>
<html:product>
<html:classificationType>PRD2</html:classificationType>
<html:productType>PRD_XE2</html:productType>
<html:productId>10005</html:productId>
<html:assignedDate>2015-12-23</html:assignedDate>
</html:product>
<html:book>
<html:name>REPAIRS2k12</html:name>
<html:Entity>REP_XE2</html:legalEntity>
<html:location>ED57</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod3">
<html:referenceData>
<html:product>
<html:classificationType>PRD4</html:classificationType>
<html:productType>PRD_XER3</html:productType>
<html:productId>10014</html:productId>
<html:assignedDate>2010-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>DESTRUCTION</html:name>
<html:Entity>REP_XE3</html:legalEntity>
<html:location>ED43</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod4">
<html:referenceData>
<html:product>
<html:classificationType>PRD4</html:classificationType>
<html:productType>PRD_XE4</html:productType>
<html:productId>10567</html:productId>
<html:assignedDate>1999-07-23</html:assignedDate>
</html:product>
<html:book>
<html:name>WHORU</html:name>
<html:Entity>REP_XS4</html:legalEntity>
<html:location>ED4</html:location>
</html:book>
</html:referenceData>
</html:prod>
<html:prod id="prod5">
<html:referenceData>
<html:product>
<html:classificationType>PRD5</html:classificationType>
<html:productType>PRD_XE5</html:productType>
<html:productId>10004890</html:productId>
<html:assignedDate>2115-12-15</html:assignedDate>
</html:product>
<html:book>
<html:name>SCREW_THIS</html:name>
<html:Entity>REP_XE5</html:legalEntity>
<html:location>ED5</html:location>
</html:book>
</html:referenceData>
</html:prod>
</html:products>
Output
10004890.csv
10004890.xml
Code: Text [Select]
- prod5,PRD5,PRD_XE5,10004890,2115-12-15,SCREW_THIS,REP_XE5,ED5
- prod5,PRD5,PRD_XE5,10004890,2015-05-15,REPAIRS5,REP_XE5,ED5
10004890.xml
Code: Text [Select]
- <html:prod id="prod5">
- <html:referenceData>
- <html:product>
- <html:classificationType>PRD5</html:classificationType>
- <html:productType>PRD_XE5</html:productType>
- <html:productId>10004890</html:productId>
- <html:assignedDate>2115-12-15</html:assignedDate>
- </html:product>
- <html:book>
- <html:name>SCREW_THIS</html:name>
- <html:Entity>REP_XE5</html:legalEntity>
- <html:location>ED5</html:location>
- </html:book>
- </html:referenceData>
- </html:prod>
- <html:prod id="prod5">
- <html:referenceData>
- <html:product>
- <html:classificationType>PRD5</html:classificationType>
- <html:productType>PRD_XE5</html:productType>
- <html:productId>10004890</html:productId>
- <html:assignedDate>2015-05-15</html:assignedDate>
- </html:product>
- <html:book>
- <html:name>REPAIRS5</html:name>
- <html:Entity>REP_XE5</html:legalEntity>
- <html:location>ED5</html:location>
- </html:book>
- </html:referenceData>
- </html:prod>
Code: PowerShell [Select]
- <#
- .NAME
- XML-GUI.ps1
- #>
- Add-Type -AssemblyName System.Windows.Forms  # load WinForms so the controls below can be created
- [System.Windows.Forms.Application]::EnableVisualStyles()
- #region begin GUI{
- $Form = New-Object system.Windows.Forms.Form  # main window; shown by the ShowDialog() call at the end of the script
- $Form.ClientSize = '246,178'
- $Form.text = "XML Mulcher"
- $Form.BackColor = "#cccccc"
- $Form.TopMost = $false
- $Form.FormBorderStyle = 'Fixed3D'
- $Form.MaximizeBox = $false
- $TextBox1 = New-Object system.Windows.Forms.TextBox  # folder path; written only by the $Button2 click handler
- $TextBox1.Text = ""
- $TextBox1.multiline = $false
- $TextBox1.ReadOnly = $true  # user cannot type a path, it must come from the folder dialog
- $TextBox1.Width = 185
- $TextBox1.height = 20
- $TextBox1.Location = New-Object System.Drawing.Point(16,20)
- $TextBox1.Font = 'Microsoft Sans Serif,10'
- $ListBox1 = New-Object system.Windows.Forms.ListBox  # NOTE(review): no items are added to this list anywhere in the visible listing, so SelectedItem stays $null and the Go handler never runs; Set-Regex expects "Classification", "ProductType" or "ProductID" - confirm whether an Items.Add line was lost
- $ListBox1.text = ""
- $ListBox1.width = 100
- $ListBox1.height = 56
- $ListBox1.location = New-Object System.Drawing.Point(16,50)
- $Label1 = New-Object system.Windows.Forms.Label  # caption for the $TextBox2 status box
- $Label1.Text = "Processing:"
- $Label1.width = 68
- $Label1.height = 16
- $Label1.location = New-Object System.Drawing.Point(16,146)
- $Label1.Font = 'Microsoft Sans Serif,8'
- $TextBox2 = New-Object system.Windows.Forms.TextBox  # status box; Mulch-Files writes the current file name and finally "Finished" here
- $TextBox2.multiline = $false
- $TextBox2.ReadOnly = $true
- $TextBox2.Width = 140
- $TextBox2.height = 16
- $TextBox2.Location = New-Object System.Drawing.Point(88,144)
- $TextBox2.Font = 'Microsoft Sans Serif,8'
- $Button1 = New-Object system.Windows.Forms.Button  # "Go": its click handler calls Set-Regex with the selected list item
- $Button1.text = "Go"
- $Button1.width = 60
- $Button1.height = 30
- $Button1.location = New-Object System.Drawing.Point(171,65)
- $Button1.Font = 'Microsoft Sans Serif,10'
- $Button2 = New-Object system.Windows.Forms.Button  # "...": its click handler opens the folder browser
- $Button2.text = "..."
- $Button2.width = 25
- $Button2.height = 25
- $Button2.location = New-Object System.Drawing.Point(206,19)
- $Button2.Font = 'Microsoft Sans Serif,10'
- $Label2 = New-Object system.Windows.Forms.Label  # caption for the XML/CSV radio buttons
- $Label2.Text = "Output:"
- $Label2.width = 60
- $Label2.height = 16
- $Label2.location = New-Object System.Drawing.Point(16,120)
- $Label2.Font = 'Microsoft Sans Serif,8'
- $RadioButton1 = New-Object system.Windows.Forms.RadioButton  # XML output; Mulch-Files takes this path when $RadioButton2 is not checked
- $RadioButton1.text = "XML"
- $RadioButton1.AutoSize = $true
- $RadioButton1.width = 40
- $RadioButton1.height = 16
- $RadioButton1.location = New-Object System.Drawing.Point(88,118)
- $RadioButton1.Font = 'Microsoft Sans Serif,8'
- $RadioButton2 = New-Object system.Windows.Forms.RadioButton  # CSV output; this is the button Mulch-Files tests
- $RadioButton2.text = "CSV"
- $RadioButton2.Checked = $true  # CSV is the preselected output format
- $RadioButton2.AutoSize = $true
- $RadioButton2.width = 40
- $RadioButton2.height = 16
- $RadioButton2.location = New-Object System.Drawing.Point(148,118)
- $RadioButton2.Font = 'Microsoft Sans Serif,8'
- $Form.controls.AddRange(@($ListBox1,$TextBox1,$Button1,$Button2,$Label1,$TextBox2,$Label2,$RadioButton1,$RadioButton2))
- #region gui events {
- $Button1.Add_Click({  # "Go": start processing, but only when a folder and a list item are both chosen
- if ($TextBox1.Text -ne "") {  # a folder must have been picked via $Button2
- if ($ListBox1.SelectedItem -ne $null) {  # a list item must be selected
- Clear-Host
- Set-Regex ($ListBox1.SelectedItem)  # Set-Regex picks the pattern and then calls Mulch-Files
- }
- }
- })
- $Button2.Add_Click({  # "...": let the user pick the folder that holds the XML files
- $objForm = New-Object System.Windows.Forms.FolderBrowserDialog
- $objForm.Description = "Select folder containing XML"
- $objForm.SelectedPath = [System.Environment+SpecialFolder]'MyComputer'
- $objForm.ShowNewFolderButton = $false
- $result = $objForm.ShowDialog()
- if ($result -eq "OK") {
- $TextBox1.Text = $objForm.SelectedPath  # remember the chosen folder; Mulch-Files reads it from here
- } else {
- $TextBox1.Text = ""  # any other dialog result clears a previously chosen folder
- }
- })
- #endregion events }
- #endregion GUI }
- #Write your logic code here
- Function Set-Regex {  # Maps the selected list item name to a regex pattern and hands that pattern to Mulch-Files
- param (
- [string]$selItem  # list item text passed in by the $Button1 click handler
- )
- switch ($selItem) {  # NOTE(review): there is no default branch, so any other value leaves $regex unset when Mulch-Files is called
- "Classification" { $regex = '(____________________________)(.+?)(___)' }  # NOTE(review): the first and last groups of all three patterns are underscore runs in this listing - presumably redacted tag text, restore from the original post
- "ProductType" { $regex = '(_____________________)(.+?)(___)' }
- "ProductID" { $regex = '(___________________)(.+?)(___)' }
- }
- Mulch-Files $regex
- }
- Function Mulch-Files {  # Processes every *.xml file in the folder shown in $TextBox1 and appends the extracted text to .csv or .xml output files
- param (
- [string]$pattern  # regex chosen by Set-Regex
- )
- $files = Get-ChildItem -Path ($TextBox1.Text + "\*.xml")  # *.xml files directly inside the chosen folder
- for ($h = 0; $h -lt $files.Count; $h++) {
- $TextBox2.Text = $files[$h].Name  # show which file is being processed
- $products = (Get-Content $files[$h] -Raw) -_____ '(____)^.*?(____________________________)'  # -Raw reads the file as one string; NOTE(review): the operator and pattern are underscore runs in this listing and must be restored before the script can run
- for ($i = 1; $i -lt $products.Count; $i += 2) {  # only the odd-numbered elements of $products are visited
- $products[$i] -_____ '(_________)(.+?)(___)'  # NOTE(review): operator redacted; the result is not assigned, so whatever it returns becomes function output
- $prod = $Matches[0]  # value used as the prefix of each CSV row below
- $temp = $products[$i] -_____ $pattern  # NOTE(review): operator redacted; only $temp.Count is used afterwards
- for ($j = 0; $j -lt $temp.Count; $j++) {  # NOTE(review): the body never references $j or $temp, so the same text is appended $temp.Count times
- if ($RadioButton2.Checked) {  # CSV output selected
- $outFile = $Matches[0] + ".csv"  # file name is built from $Matches[0]; which match that is depends on the redacted operators above - TODO confirm
- $outText = ($prod + (((($products[$i] -replace '(<[^>]+>|\s)', ',' ) -replace '`r', '') -replace '`n', '') -replace '(,)(,)+', '$1').TrimEnd(','))  # tags and whitespace become commas, comma runs collapse to one, trailing comma is dropped; NOTE(review): '`r' and '`n' are single-quoted, so they are literal backtick+letter patterns rather than CR/LF - line breaks are already covered by \s in the first replace
- } else {  # XML output selected
- $outFile = $Matches[0] + ".xml"
- $outText = $products[$i]  # the element is written out unchanged
- }
- Out-File -FilePath $outFile -InputObject $outText -Append  # $outFile has no folder part, so the path is relative; -Append adds to any existing file
- }
- }
- }
- $TextBox2.Text = "Finished"
- }
- [void]$Form.ShowDialog()  # display the form; [void] discards the value ShowDialog() returns

Recent Posts

