Skip to main content
added 1002 characters in body
Source Link
Ed Morton
  • 36k
  • 6
  • 25
  • 60

EDIT: in response to your comment below, here's how to modify the above if you can't statically define a maximum length for the lines from fileA (not even 100,000 chars?) and so need to calculate it. This version also assumes the lines from fileA are all lower case:

# Pass 1 (first file, fileA): remember every search string with its length and
# track the longest one so the dot filler can be sized to fit.
# The strings are stored exactly as read; they are expected to be lower case
# already, because pass 2 searches a lower-cased copy of each fileB line.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: some awk implementations report an empty search
    # string as found at position 1 by index(), which would flag every line
    # of the second file and emit it with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[$0] = lgth
    maxLgth = (lgth > maxLgth ? lgth : maxLgth)
    next
}
# First line of a later file: build one run of maxLgth dots, then cut a
# same-length dot string for each search string, and name the output files.
FNR==1 {
    dots = sprintf("%*s",maxLgth,"")
    gsub(/ /,".",dots)
    for ( str in str2lgth ) {
        str2dots[str] = substr(dots,1,str2lgth[str])
    }
    resSingle = "res-single"
    resLength = "res-length"
}
# Pass 2 (fileB): for every search string found in the line (plain string
# comparison, case-insensitive on the fileB side), write the line twice:
#   res-single - first occurrence replaced by a single "."
#   res-length - first occurrence replaced by one "." per matched character
# A line containing several search strings is written once per string; the
# order in which strings are tried is whatever "for (str in ...)" yields.
{
    lc = tolower($0)
    for (str in str2lgth) {
        # s is the 1-based start of the match, or 0 (false) when absent.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

EDIT: in response to your comment below, here's how to modify the above if you can't statically define a maximum length for the lines from fileA (not even 100,000 chars?) and so need to calculate it. This version also assumes the lines from fileA are all lower case:

# Pass 1 (first file, fileA): remember every search string with its length and
# track the longest one so the dot filler can be sized to fit.
# The strings are stored exactly as read; they are expected to be lower case
# already, because pass 2 searches a lower-cased copy of each fileB line.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: some awk implementations report an empty search
    # string as found at position 1 by index(), which would flag every line
    # of the second file and emit it with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[$0] = lgth
    maxLgth = (lgth > maxLgth ? lgth : maxLgth)
    next
}
# First line of a later file: build one run of maxLgth dots, then cut a
# same-length dot string for each search string, and name the output files.
FNR==1 {
    dots = sprintf("%*s",maxLgth,"")
    gsub(/ /,".",dots)
    for ( str in str2lgth ) {
        str2dots[str] = substr(dots,1,str2lgth[str])
    }
    resSingle = "res-single"
    resLength = "res-length"
}
# Pass 2 (fileB): for every search string found in the line (plain string
# comparison, case-insensitive on the fileB side), write the line twice:
#   res-single - first occurrence replaced by a single "."
#   res-length - first occurrence replaced by one "." per matched character
# A line containing several search strings is written once per string; the
# order in which strings are tried is whatever "for (str in ...)" yields.
{
    lc = tolower($0)
    for (str in str2lgth) {
        # s is the 1-based start of the match, or 0 (false) when absent.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}
edited body
Source Link
Ed Morton
  • 36k
  • 6
  • 25
  • 60
$ cat tst.awk
# Build a 1000-character run of dots once (so no search string may be longer
# than 1000 characters) and name the two output files.
BEGIN {
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Every line of both files: keep a lower-cased copy so that matching is
# case-insensitive while the output preserves the original case.
{ lc = tolower($0) }
# First file (fileA): store each lower-cased search string with its length
# and a dot string of that same length.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: some awk implementations report an empty search
    # string as found at position 1 by index(), which would flag every line
    # of the second file and emit it with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Second file (fileB): for every search string found in the line (plain
# string comparison, not a regexp), write the line twice:
#   res-single - first occurrence replaced by a single "."
#   res-length - first occurrence replaced by one "." per matched character
# A line containing several search strings is written once per string; the
# order in which strings are tried is whatever "for (str in ...)" yields.
{
    for (str in str2lgth) {
        # s is the 1-based start of the match, or 0 (false) when absent.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}
 

.

$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45

.

 
$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB, and that you want to do a string rather than a regexp comparison. Both are trivial tweaks if that's not what you want.

$ cat tst.awk
# Build a 1000-character run of dots once (so no search string may be longer
# than 1000 characters) and name the two output files.
BEGIN {
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Every line of both files: keep a lower-cased copy so that matching is
# case-insensitive while the output preserves the original case.
{ lc = tolower($0) }
# First file (fileA): store each lower-cased search string with its length
# and a dot string of that same length.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: some awk implementations report an empty search
    # string as found at position 1 by index(), which would flag every line
    # of the second file and emit it with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Second file (fileB): for every search string found in the line (plain
# string comparison, not a regexp), write the line twice:
#   res-single - first occurrence replaced by a single "."
#   res-length - first occurrence replaced by one "." per matched character
# A line containing several search strings is written once per string; the
# order in which strings are tried is whatever "for (str in ...)" yields.
{
    for (str in str2lgth) {
        # s is the 1-based start of the match, or 0 (false) when absent.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}
 
$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45

.

$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB, and that you want to do a string rather than a regexp comparison.

$ cat tst.awk
# Build a 1000-character run of dots once (so no search string may be longer
# than 1000 characters) and name the two output files.
BEGIN {
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Every line of both files: keep a lower-cased copy so that matching is
# case-insensitive while the output preserves the original case.
{ lc = tolower($0) }
# First file (fileA): store each lower-cased search string with its length
# and a dot string of that same length.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: some awk implementations report an empty search
    # string as found at position 1 by index(), which would flag every line
    # of the second file and emit it with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Second file (fileB): for every search string found in the line (plain
# string comparison, not a regexp), write the line twice:
#   res-single - first occurrence replaced by a single "."
#   res-length - first occurrence replaced by one "." per matched character
# A line containing several search strings is written once per string; the
# order in which strings are tried is whatever "for (str in ...)" yields.
{
    for (str in str2lgth) {
        # s is the 1-based start of the match, or 0 (false) when absent.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

.

$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45
 
$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB, and that you want to do a string rather than a regexp comparison. Both are trivial tweaks if that's not what you want.

Source Link
Ed Morton
  • 36k
  • 6
  • 25
  • 60

$ cat tst.awk
# Build a 1000-character run of dots once (so no search string may be longer
# than 1000 characters) and name the two output files.
BEGIN {
    dots = sprintf("%*s",1000,"")
    gsub(/ /,".",dots)
    resSingle = "res-single"
    resLength = "res-length"
}
# Every line of both files: keep a lower-cased copy so that matching is
# case-insensitive while the output preserves the original case.
{ lc = tolower($0) }
# First file (fileA): store each lower-cased search string with its length
# and a dot string of that same length.
NR==FNR {
    lgth = length($0)
    # Skip blank lines: some awk implementations report an empty search
    # string as found at position 1 by index(), which would flag every line
    # of the second file and emit it with a stray leading ".".
    if ( lgth == 0 ) {
        next
    }
    str2lgth[lc] = lgth
    str2dots[lc] = substr(dots,1,lgth)
    next
}
# Second file (fileB): for every search string found in the line (plain
# string comparison, not a regexp), write the line twice:
#   res-single - first occurrence replaced by a single "."
#   res-length - first occurrence replaced by one "." per matched character
# A line containing several search strings is written once per string; the
# order in which strings are tried is whatever "for (str in ...)" yields.
{
    for (str in str2lgth) {
        # s is the 1-based start of the match, or 0 (false) when absent.
        if ( s=index(lc,str) ) {
            bef = substr($0,1,s-1)
            aft = substr($0,s+str2lgth[str])
            print bef "." aft > resSingle
            print bef str2dots[str] aft > resLength
        }
    }
}

$ awk -f tst.awk fileA fileB

$ cat res-single
12.1991
ari.#!
.agnes#!
.45

.

$ cat res-length
12....1991
ari.....#!
...agnes#!
...45

The above assumes that no line in fileA will be longer than 1000 characters; if that's wrong, pick a bigger number, or we can add code to calculate it if necessary. It also assumes you don't care what order the lines from fileA are looked for in fileB, and that you want to do a string rather than a regexp comparison.