Revisions to For each line in file A replace all matching lines in file B with a pattern

added 1002 characters in body

Source Link

edited Jul 4, 2020 at 11:44

36k
6
25
60

EDIT: in response to your comment below, here's how to modify the above if you can't statically define a maximum length for the lines from fileA (not even 100,000 chars?) and so need to calculate it, assuming the lines from fileA are all lower case:

# Pass 1 (first file, fileA): remember every search string with its length and
# track the longest length seen so the dot filler can be sized afterwards.
# Keys are stored exactly as read and are later compared against lower-cased
# lines, so the lines of fileA are assumed to be all lower case.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: common awk implementations report a match at position 1
    # for index(s, ""), so an empty key would "match" every line of fileB and
    # emit a bogus record with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[$0] = lgth
    maxLgth = (lgth > maxLgth ? lgth : maxLgth)
    next
}
# Runs on the first record of a file read after the first one: build a run of
# maxLgth dots, then cut one filler per search string, each as long as the
# string it replaces.
FNR==1 {
    dots = sprintf("%*s",maxLgth,"")
    gsub(/ /,".",dots)
    for ( str in str2lgth ) {
        str2dots[str] = substr(dots,1,str2lgth[str])
    }
    resSingle = "res-single"
    resLength = "res-length"
}
# Pass 2 (fileB): for every stored string found in the lower-cased line, write
# the original line with the first occurrence replaced by a single "." to
# res-single and by a same-length run of dots to res-length.
{
    lc = tolower($0)
    for (str in str2lgth) {
        # index() returns 0 (false) when str does not occur in lc.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

EDIT: in response to your comment below, here's how to modify the above if you can't statically define a maximum length for the lines from fileA (not even 100,000 chars?) and so need to calculate it, assuming the lines from fileA are all lower case:

# Pass 1 (first file, fileA): remember every search string with its length and
# track the longest length seen so the dot filler can be sized afterwards.
# Keys are stored exactly as read and are later compared against lower-cased
# lines, so the lines of fileA are assumed to be all lower case.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: common awk implementations report a match at position 1
    # for index(s, ""), so an empty key would "match" every line of fileB and
    # emit a bogus record with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[$0] = lgth
    maxLgth = (lgth > maxLgth ? lgth : maxLgth)
    next
}
# Runs on the first record of a file read after the first one: build a run of
# maxLgth dots, then cut one filler per search string, each as long as the
# string it replaces.
FNR==1 {
    dots = sprintf("%*s",maxLgth,"")
    gsub(/ /,".",dots)
    for ( str in str2lgth ) {
        str2dots[str] = substr(dots,1,str2lgth[str])
    }
    resSingle = "res-single"
    resLength = "res-length"
}
# Pass 2 (fileB): for every stored string found in the lower-cased line, write
# the original line with the first occurrence replaced by a single "." to
# res-single and by a same-length run of dots to res-length.
{
    lc = tolower($0)
    for (str in str2lgth) {
        # index() returns 0 (false) when str does not occur in lc.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

edited body

Source Link

edited Jul 1, 2020 at 15:37

Ed Morton

36k
6
25
60

$ cat tst.awk
# Usage: awk -f tst.awk fileA fileB
# For each line of fileB containing (case-insensitively) a line of fileA,
# write the fileB line with the first occurrence replaced by a single "." to
# res-single and by a same-length run of dots to res-length.
BEGIN {
    # Pre-build a run of 1000 dots; per-string fillers are cut from it, so a
    # string from fileA longer than 1000 characters gets at most 1000 dots.
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Lower-case copy of the current record, used both as the fileA key and as
# the fileB text searched with index().
{ lc = tolower($0) }
# First file (fileA): store each search string's length and dot filler.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: common awk implementations report a match at position 1
    # for index(s, ""), so an empty key would "match" every line of fileB and
    # emit a bogus record with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Remaining input (fileB): emit one output record per stored string found.
{
    for (str in str2lgth) {
        # index() returns 0 (false) when str does not occur in lc.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

.

$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45

.

 
$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB and that you want to do a string rather than a regexp comparison; both are, again, trivial tweaks if that's not what you want.

$ cat tst.awk
# Usage: awk -f tst.awk fileA fileB
# For each line of fileB containing (case-insensitively) a line of fileA,
# write the fileB line with the first occurrence replaced by a single "." to
# res-single and by a same-length run of dots to res-length.
BEGIN {
    # Pre-build a run of 1000 dots; per-string fillers are cut from it, so a
    # string from fileA longer than 1000 characters gets at most 1000 dots.
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Lower-case copy of the current record, used both as the fileA key and as
# the fileB text searched with index().
{ lc = tolower($0) }
# First file (fileA): store each search string's length and dot filler.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: common awk implementations report a match at position 1
    # for index(s, ""), so an empty key would "match" every line of fileB and
    # emit a bogus record with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Remaining input (fileB): emit one output record per stored string found.
{
    for (str in str2lgth) {
        # index() returns 0 (false) when str does not occur in lc.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}
 
$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45

.

$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB and that you want to do a string rather than a regexp comparison.

$ cat tst.awk
# Usage: awk -f tst.awk fileA fileB
# For each line of fileB containing (case-insensitively) a line of fileA,
# write the fileB line with the first occurrence replaced by a single "." to
# res-single and by a same-length run of dots to res-length.
BEGIN {
    # Pre-build a run of 1000 dots; per-string fillers are cut from it, so a
    # string from fileA longer than 1000 characters gets at most 1000 dots.
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Lower-case copy of the current record, used both as the fileA key and as
# the fileB text searched with index().
{ lc = tolower($0) }
# First file (fileA): store each search string's length and dot filler.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: common awk implementations report a match at position 1
    # for index(s, ""), so an empty key would "match" every line of fileB and
    # emit a bogus record with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Remaining input (fileB): emit one output record per stored string found.
{
    for (str in str2lgth) {
        # index() returns 0 (false) when str does not occur in lc.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

.

$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45
 
$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB and that you want to do a string rather than a regexp comparison; both are, again, trivial tweaks if that's not what you want.

Source Link

answered Jul 1, 2020 at 15:29

Ed Morton

36k
6
25
60

$ cat tst.awk
# Usage: awk -f tst.awk fileA fileB
# For each line of fileB containing (case-insensitively) a line of fileA,
# write the fileB line with the first occurrence replaced by a single "." to
# res-single and by a same-length run of dots to res-length.
BEGIN {
    # Pre-build a run of 1000 dots; per-string fillers are cut from it, so a
    # string from fileA longer than 1000 characters gets at most 1000 dots.
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Lower-case copy of the current record, used both as the fileA key and as
# the fileB text searched with index().
{ lc = tolower($0) }
# First file (fileA): store each search string's length and dot filler.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: common awk implementations report a match at position 1
    # for index(s, ""), so an empty key would "match" every line of fileB and
    # emit a bogus record with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Remaining input (fileB): emit one output record per stored string found.
{
    for (str in str2lgth) {
        # index() returns 0 (false) when str does not occur in lc.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45

.

$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB and that you want to do a string rather than a regexp comparison.

Stack Exchange Network

Return to Answer