Discussion forum for all Windows batch related topics.
Moderator: DosItHelp
-
plasma33
- Posts: 22
- Joined: 26 Jul 2017 21:18
#1
Post
by plasma33 » 09 Aug 2017 00:23
Hello guys,
I would like to extract substrings from raw strings, please. The input text file contains the raw strings (I have around 200,000 lines of them). The output text file should contain only the extracted substrings. Please see below for more details:
Sample input text file:Code: Select all
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------M-I---NI-I--PGANB-------------------
----------------------------------GP----------GAGPM-PMMI---Q
------------------NR----------------------------ML---M------
--CMMIG-----I-H---------------RD------------RDIGNKMFN--GIPIG
NBG--A-----GPGPNI--ML----NK-NL------------------------------
----------------------NI----MERB---GA--GPND----GAGP---------
Sample output text file:Code: Select all
M
I
NI
I
PGANB
GP
GAGPM
PMMI
Q
NR
ML
M
CMMIG
I
H
RD
RDIGNKMFN
GIPIGNBG
A
GPGPNI
ML
NK
NL
NI
MERB
GA
GPND
GAGP
Please be aware that most of the time some substrings will be continued on to next line. Please see the end of line 8 and the beginning of line 9 in the sample input text file. Line 18 in the sample output text file is the substring that was partially in line 8 and line 9.
Thanks, guys.
Plasma33
-
elzooilogico
- Posts: 128
- Joined: 23 May 2016 15:39
- Location: Spain
#2
Post
by elzooilogico » 09 Aug 2017 04:02
This could be improved (it is slow), but it does the job.
Code: Select all
@echo off
rem Extracts the dash-separated substrings of the input file and writes them,
rem one per line, to the output file. A substring that is cut by a line break
rem is rejoined with its continuation at the start of the next line.
rem NOTE(review): for a line that contains no dash at all, the carried text is
rem written joined with that line and the same line is then carried again, so
rem a substring spanning a whole line looks mishandled - confirm.
SetLocal EnableExtensions EnableDelayedExpansion
set "input=input.txt"
set "output=output.txt"
rem line counter, needed to show progress
set/a cnt=0
rem capture the text produced by the $H prompt code into DEL
rem presumably a backspace sequence - TODO confirm the exact characters
for /F "tokens=1,2 delims=#" %%a in ('"prompt #$H#$E & echo on & for %%b in (1) do rem"') do set "DEL=%%a"
rem DEL_LINE holds 78 copies of DEL and is printed before each progress message
for /L %%i in (1,1,39) do set "DEL_LINE=!DEL_LINE!%DEL%%DEL%"
rem everything echoed inside the following block goes to the output file
>"%output%" (
for /F "delims=" %%a in (%input%) do (
rem index is the number of the first field to write for this line
set/a index=1, cnt+=1
rem show progress on the console device because standard output is the file
<NUL set/P=!DEL_LINE!Processing line !cnt!>CON
rem split the line into x1 .. xN, with N returned in i and x1 copied to first
call :split %%a
rem if there is a remainder from the previous line, check the begin of this line
if defined last (
rem the comparison is true only when first is not empty
if "!last!!first!" neq "!last!" (
rem the remainder continues on this line: write both joined and skip field 1
set /a index=2
echo !last!!first!
) else (
rem this line starts with a dash, so the remainder was already complete
echo !last!
)
)
rem remember last string, it may continue on the next line
for %%i in (!i!) do set "last=!x%%i!"
rem write data except last string, empty fields are undefined and skipped
set/a i-=1
for /L %%i in (!index!,1,!i!) do if defined x%%i echo !x%%i!
)
rem if last line ends with a string, write it
if defined last echo !last!
) & rem >"%output%"
EndLocal
exit/B
rem split string into substrings based on delimiter
rem http://www.dostips.com/forum/viewtopic.php?f=3&t=6429#p41035
rem Input:  argument 1 is the text to split at every dash
rem Output: x1, x2 and so on receive the fields, i is the number of fields and
rem         first is a copy of x1. An empty field leaves its variable undefined.
rem Requires delayed expansion to be enabled by the caller.
:split
set "line=%~1"
set/a i=1
rem The substitution below replaces every dash with text that closes the
rem current assignment, increments i and opens the next assignment, so this
rem single line expands into one set command per field.
set "x!i!=%line:-=" & set /A i+=1 & set "x!i!=%"
set "first=!x1!"
exit/B
EDIT: now let's do it faster with a
batch and C# hybrid.
NOTE: The .NET Framework must be installed on your system.
Code: Select all
//>nul 2>nul||@goto :batch_code
/*
:batch_code
@echo off
setlocal
rem Batch and C# hybrid. cmd.exe cannot run the first line as a command and
rem therefore jumps to :batch_code, while the C# compiler treats the first
rem line and this whole batch section as comments.
rem NOTE: never write the C# block comment terminator inside this section.

rem place desired exe name
set "theExeFile=myParser2.exe"
rem input output files. place full path plus filename, e.g.
rem set "inputFile=C:\Users\Michael\Desktop\input.txt"
rem set "outputFile=C:\Users\Michael\Desktop\output.txt"
set "inputFile=.\input.txt"
set "outputFile=.\output.txt"

rem compile the parser on first use and stop here when the build fails
if not exist "%theExeFile%" call :build_the_exe || exit/B
"%theExeFile%" "%inputFile%" "%outputFile%"
if %errorlevel% NEQ 0 echo Error code %errorlevel%
endlocal
exit /b 0

rem ---------------------------------------------------------------------------
rem build_the_exe: compiles this file into theExeFile with csc.exe.
rem Returns 1 when no compiler is found, otherwise the exit code of csc.exe.
rem ---------------------------------------------------------------------------
:build_the_exe
rem find csc.exe: walk the framework directories from the highest name down
rem and keep the first one that really contains a compiler
set "frm=%SystemRoot%\Microsoft.NET\Framework\"
set "csc="
for /f "tokens=* delims=" %%v in ('dir /b /a:d /o:-n "%frm%v*" 2^>nul') do (
  if not defined csc if exist "%frm%%%v\csc.exe" set "csc=%frm%%%v\csc.exe"
)
rem csc not found
if not defined csc echo/&echo/Warning: Net Framework Not Found&exit/B 1
rem csc found: compile this very file and report the compiler result
"%csc%" /nologo /out:"%theExeFile%" "%~dpsfnx0"
exit/B %errorlevel%
*/
//begin c# code
using System;
using System.IO;
using System.Linq;
namespace ElZooilogico
{
    /// <summary>
    /// Extracts the '-' separated substrings of a text file and writes them,
    /// one per line, to an output file. A substring that is cut by a line
    /// break is rejoined with its continuation on the following line(s).
    /// </summary>
    /// <remarks>
    /// This source is compiled by the csc.exe that ships with the .NET
    /// Framework, so it deliberately sticks to C# 5 language features.
    /// </remarks>
    public class Parser
    {
        /// <summary>
        /// Reads <paramref name="inputFile"/> line by line and writes every
        /// non-empty substring found between '-' characters to
        /// <paramref name="outputFile"/>, printing progress to the console.
        /// </summary>
        /// <param name="inputFile">Path of the text file to read.</param>
        /// <param name="outputFile">Path of the text file to create or overwrite.</param>
        private static void parse(string inputFile, string outputFile)
        {
            string line;
            // Text after the final '-' of the previous line; it may continue on the next line.
            string last = String.Empty;
            long lineNum = 0;
            // The file is enumerated once up front only to report a percentage.
            long lineCount = File.ReadLines(inputFile).Count();

            Console.WriteLine("\nInput file: {0}\nOutput file: {1}\n", inputFile, outputFile);

            using (StreamReader input = new StreamReader(inputFile))
            using (StreamWriter output = new StreamWriter(outputFile))
            {
                while ((line = input.ReadLine()) != null)
                {
                    lineNum++;
                    Console.Write("\rProcessing line {0} of {1}, {2}%", lineNum, lineCount, (lineNum * 100) / lineCount);

                    // Empty entries are kept on purpose: an empty first or last
                    // entry tells us the line starts or ends with '-'.
                    string[] words = line.Split('-');

                    // The first entry continues whatever was left open by the previous line.
                    string head = last + words[0];

                    if (words.Length == 1 && words[0].Length > 0)
                    {
                        // No '-' on this line at all: the substring is still open,
                        // so keep accumulating instead of writing it out early.
                        last = head;
                        continue;
                    }

                    if (head.Length > 0)
                    {
                        output.WriteLine(head);
                    }

                    // Entries strictly between the first and the last are complete substrings.
                    for (int i = 1; i < words.Length - 1; i++)
                    {
                        if (words[i].Length > 0)
                        {
                            output.WriteLine(words[i]);
                        }
                    }

                    last = words[words.Length - 1];
                }

                // The file ended while a substring was still open.
                if (last.Length > 0)
                {
                    output.WriteLine(last);
                }
            } // both files are closed here

            Console.WriteLine("\n\nDone!");
        }

        /// <summary>
        /// Program entry point: <c>parser inputFile outputFile</c>.
        /// </summary>
        /// <param name="args">Exactly two items: input path and output path.</param>
        /// <returns>
        /// 0 on success, 9 when the argument count is wrong, 1 when the input
        /// file does not exist, 3 when processing throws an exception.
        /// </returns>
        public static int Main(string[] args)
        {
            try
            {
                if (args.Length != 2)
                {
                    return 9;
                }
                if (!File.Exists(args[0]))
                {
                    return 1;
                }
                parse(args[0], args[1]);
            }
            catch (Exception e)
            {
                // Report on stderr: a modal dialog would block an unattended batch run.
                Console.Error.WriteLine(e.Message);
                return 3;
            }
            return 0;
        }
    } // class Parser
} // namespace ElZooilogico
-
Aacini
- Expert
- Posts: 1914
- Joined: 06 Dec 2011 22:15
- Location: México City, México
-
Contact:
#3
Post
by Aacini » 09 Aug 2017 07:35
Code: Select all
@echo off
setlocal EnableDelayedExpansion
rem Extract the dash-separated substrings of input.txt into output.txt, one
rem per line. A substring cut by a line break is rejoined: the open fragment
rem is printed without a line feed and the next line completes it.
rem Words are written with the echo( form so that a substring equal to ON or
rem OFF is printed instead of switching command echoing.
set "last="
(
  for /F "delims=" %%a in (input.txt) do (
    set "line=%%a"
    if defined last (
      rem print the carried fragment without a line feed so that the first
      rem word of this line is appended to it
      set /P "=!last!" < NUL & set "last="
      rem a leading dash means the fragment was already complete
      if "!line:~0,1!" equ "-" echo/
    )
    rem tail is defined when the line does not end in a dash, in which case
    rem the final word may continue on the next line
    set "tail="
    if "!line:~-1!" neq "-" set "tail=1"
    rem dashes become spaces so that a plain FOR splits the line into words
    set "line=!line:-= !"
    for %%b in (!line!) do (
      if defined tail (
        rem delay every word by one step so the final one stays in last
        if defined last echo(!last!
        set "last=%%b"
      ) else (
        echo(%%b
      )
    )
  )
  rem the file ended with an open fragment
  if defined last echo(!last!
) > output.txt
If this program is too slow, the same method could be translated to JScript...
Antonio
-
plasma33
- Posts: 22
- Joined: 26 Jul 2017 21:18
#4
Post
by plasma33 » 09 Aug 2017 16:12
Hello guys,
Codes from both of you work perfectly. @elzooilogico, your hybrid code is amazing. Super fast!!
Thanks, guys.
Plasma33
-
DosItHelp
- Expert
- Posts: 239
- Joined: 18 Feb 2006 19:54
#5
Post
by DosItHelp » 21 Aug 2017 01:44
Aacini's solution shortened:
Code: Select all
@echo off
setlocal EnableDelayedExpansion
rem Extract the dash-separated substrings of input.txt into output.txt, one
rem per line. A substring cut by a line break is rejoined by prepending the
rem unfinished fragment to the next line before that line is split.
rem Words are written with the echo( form so that a substring equal to ON or
rem OFF is printed instead of switching command echoing.
set "last="
(
  for /F "delims=" %%a in (input.txt) do (
    set "line=!last!%%a"
    set "last="
    rem remember the final character before the dashes are replaced
    set "tail=!line:~-1!"
    rem dashes become spaces so that a plain FOR splits the line into words
    set "line=!line:-= !"
    for %%b in (!line!) do (
      if defined last echo(!last!
      set "last=%%b"
    )
    rem a line ending in a dash closes its final word, so write it now
    rem instead of gluing it to the first word of the next line
    if "!tail!" == "-" if defined last (
      echo(!last!
      set "last="
    )
  )
  rem the file ended with an open fragment
  if defined last echo(!last!
) > output.txt