Skip to main content
added 2 characters in body
Source Link
slm
  • 380.1k
  • 127
  • 793
  • 897
# ex.awk -- pair rows of file2 with the file1 row that shares columns 1 and 2.
# Usage: awk -f ex.awk file1 file2
# A file2 row is printed (prefixed by the stored file1 row) when its columns
# 4 and 5 are equal and that value matches column 4 or column 5 of file1.

# First file on the command line: index every row by its two-column key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5   # used only for the (k in a) membership test below
    b[k]=$0        # whole file1 row, echoed in the output
    c[k]=$4
    d[k]=$5
    next
}

# Remaining input: look up the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
    # file1 file2
    # The || is grouped explicitly: && binds tighter than ||, so without the
    # inner parentheses a lone (ld==$5) match printed the row even when the
    # key was unknown or $4 differed from $5.
    if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

if ((k in a) && (lc==$4) && (ld==$5)) next

if ((k in a) && (lc==$4) && (ld==$5)) next
# ex.awk -- pair rows of file2 with the file1 row that shares columns 1 and 2.
# Usage: awk -f ex.awk file1 file2
# A file2 row is printed (prefixed by the stored file1 row) when its columns
# 4 and 5 are equal and that value matches column 4 or column 5 of file1.

# First file on the command line: index every row by its two-column key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5   # used only for the (k in a) membership test below
    b[k]=$0        # whole file1 row, echoed in the output
    c[k]=$4
    d[k]=$5
    next
}

# Remaining input: look up the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
    # file1 file2
    # Group the || explicitly; && has higher precedence, so the ungrouped form
    # let a lone (ld==$5) match print the row regardless of the other tests.
    if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

if ((k in a) && (lc==$4) && (ld==$5)) next

# ex.awk -- pair rows of file2 with the file1 row that shares columns 1 and 2.
# Usage: awk -f ex.awk file1 file2
# A file2 row is printed (prefixed by the stored file1 row) when its columns
# 4 and 5 are equal and that value matches column 4 or column 5 of file1.

# First file on the command line: index every row by its two-column key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5   # used only for the (k in a) membership test below
    b[k]=$0        # whole file1 row, echoed in the output
    c[k]=$4
    d[k]=$5
    next
}

# Remaining input: look up the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
  # file1 file2
  # Inner parentheses are required: && binds tighter than ||, and the
  # ungrouped condition printed on a lone (ld==$5) match.
  if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}
if ((k in a) && (lc==$4) && (ld==$5)) next
added 1188 characters in body
Source Link
slm
  • 380.1k
  • 127
  • 793
  • 897

A change in requirements

The OP mentioned in the comments below that he'd like the ultimate solution to drop any lines where the 4th and 5th columns from file1 matched the 4th and 5th columns from file2.

For example, add this line to both file1 & file2:

s2/40   40      .       S       S       90      N=2     F=5;U=4

A single line addition to the original solution can address this particular change in the requirements.

if ((k in a) && (lc==$4) && (ld==$5)) next

New Example

ex2.awk:

# ex2.awk -- like ex.awk, but rows whose columns 4 and 5 are identical in
# both files are dropped instead of printed.
# Usage: awk -f ex2.awk file1 file2

# First file on the command line: index every row by its two-column key.
FNR==NR{
  k=$1" "$2
  a[k]=$4" "$5   # used only for the (k in a) membership tests below
  b[k]=$0        # whole file1 row, echoed in the output
  c[k]=$4
  d[k]=$5
  next
}

# Remaining input: compare against the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
  # Columns 4 and 5 agree with file1 on both sides: skip the row.
  if ((k in a) && (lc==$4) && (ld==$5)) next
  # Group the || explicitly; && binds tighter than ||, so the ungrouped form
  # printed on a lone (ld==$5) match even when $4 differed from $5.
  if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

Rerunning the new awk script, ex2.awk:

$ awk -f ex2.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4

A change in requirements

The OP mentioned in the comments below that he'd like the ultimate solution to drop any lines where the 4th and 5th columns from file1 matched the 4th and 5th columns from file2.

For example, add this line to both file1 & file2:

s2/40   40      .       S       S       90      N=2     F=5;U=4

A single line addition to the original solution can address this particular change in the requirements.

if ((k in a) && (lc==$4) && (ld==$5)) next

New Example

ex2.awk:

# ex2.awk -- like ex.awk, but rows whose columns 4 and 5 are identical in
# both files are dropped instead of printed.
# Usage: awk -f ex2.awk file1 file2

# First file on the command line: index every row by its two-column key.
FNR==NR{
  k=$1" "$2
  a[k]=$4" "$5   # used only for the (k in a) membership tests below
  b[k]=$0        # whole file1 row, echoed in the output
  c[k]=$4
  d[k]=$5
  next
}

# Remaining input: compare against the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
  # Both columns unchanged relative to file1: nothing to report.
  if ((k in a) && (lc==$4) && (ld==$5)) next
  # Inner parentheses keep the "either column matches" test inside the
  # conjunction; && has higher precedence than ||.
  if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

Rerunning the new awk script, ex2.awk:

$ awk -f ex2.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4
added 114 characters in body
Source Link
slm
  • 380.1k
  • 127
  • 793
  • 897

Here's a solution using just awk. Put the below code in a file called ex.awk:

# ex.awk -- pair rows of file2 with the file1 row that shares columns 1 and 2.
# Usage: awk -f ex.awk file1 file2
# A file2 row is printed (prefixed by the stored file1 row) when its columns
# 4 and 5 are equal and that value matches column 4 or column 5 of file1.

# First file on the command line: index every row by its two-column key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5   # used only for the (k in a) membership test below
    b[k]=$0        # whole file1 row, echoed in the output
    c[k]=$4
    d[k]=$5
    next
}

# Remaining input: look up the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
    # file1 file2
    # The || is grouped explicitly because && binds tighter than ||.
    if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

And then run it like this with the above 2 files:

$ awk -f ex.awk file1 file2

Example

The sed is just to format the output for StackExchange!

$ awk -f ex.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4

Here's a solution using just awk. Put the below code in a file called ex.awk:

# ex.awk -- print each row of the later input next to the row of the first
# input file that has the same first two columns.
# Usage: awk -f ex.awk file2 file1

# First input file: remember every row under its two-column key.
NR == FNR {
  key = $1 " " $2
  cols[key] = $4 " " $5
  row[key] = $0
  next
}

# Later input: on a key hit, emit the current row, two spaces, then the
# remembered row.
($1 " " $2) in cols {
  print $0 "  " row[$1 " " $2]
}

And then run it like this with the above 2 files:

$ awk -f ex.awk file2 file1

Example

The sed is just to format the output for StackExchange!

$ awk -f ex.awk file2 file1 | sed 's/[ ]\+/  /g'
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  T  55  N=2  F=5;U=4

Here's a solution using just awk. Put the below code in a file called ex.awk:

# ex.awk -- pair rows of file2 with the file1 row that shares columns 1 and 2.
# Usage: awk -f ex.awk file1 file2
# A file2 row is printed (prefixed by the stored file1 row) when its columns
# 4 and 5 are equal and that value matches column 4 or column 5 of file1.

# First file on the command line: index every row by its two-column key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5   # used only for the (k in a) membership test below
    b[k]=$0        # whole file1 row, echoed in the output
    c[k]=$4
    d[k]=$5
    next
}

# Remaining input: look up the stored file1 columns for this key.
{ k=$1" "$2
  lc=c[k]
  ld=d[k]
    # file1 file2
    # The || is grouped explicitly: && binds tighter than ||, so without the
    # inner parentheses a lone (ld==$5) match printed the row even when the
    # key was unknown or $4 differed from $5.
    if ((k in a) && ($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

And then run it like this with the above 2 files:

$ awk -f ex.awk file1 file2

Example

The sed is just to format the output for StackExchange!

$ awk -f ex.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4
Source Link
slm
  • 380.1k
  • 127
  • 793
  • 897
Loading