Revisions to Comparing Files based on 5 fields using Awk and Bash

added 2 characters in body

Source Link

edited Aug 21, 2013 at 21:41

380.1k
127
793
897

# ex.awk -- pair up rows of two whitespace-delimited files.
#
# Usage: awk -f ex.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5
    b[k]=$0
    c[k]=$4
    d[k]=$5
    next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
    # file1 file2
    # && binds tighter than ||, so the "either column" test needs its own
    # parentheses; without them a lone ld==$5 match printed the row even
    # when $4!=$5.
    if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

if ((k in a) && (lc==$4) && (ld==$5)) next

if ((k in a) && (lc==$4) && (ld==$5)) next

# ex.awk -- pair up rows of two whitespace-delimited files.
#
# Usage: awk -f ex.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5
    b[k]=$0
    c[k]=$4
    d[k]=$5
    next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
    # file1 file2
    # && binds tighter than ||, so the "either column" test needs its own
    # parentheses; without them a lone ld==$5 match printed the row even
    # when $4!=$5.
    if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

if ((k in a) && (lc==$4) && (ld==$5)) next

# ex.awk -- pair up rows of two whitespace-delimited files.
#
# Usage: awk -f ex.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5
    b[k]=$0
    c[k]=$4
    d[k]=$5
    next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
  # file1 file2
  # && binds tighter than ||, so the "either column" test needs its own
  # parentheses; without them a lone ld==$5 match printed the row even
  # when $4!=$5.
  if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

if ((k in a) && (lc==$4) && (ld==$5)) next

added 1188 characters in body

Source Link

edited Aug 19, 2013 at 6:12

slm ♦

380.1k
127
793
897

A change in requirements

The OP mentioned in the comments below that they'd like the final solution to drop any line whose 4th and 5th columns in file1 match the 4th and 5th columns of the corresponding line in file2.

For example, add this line to both file1 & file2:

s2/40   40      .       S       S       90      N=2     F=5;U=4

A single line addition to the original solution can address this particular change in the requirements.

if ((k in a) && (lc==$4) && (ld==$5)) next

New Example

ex2.awk:

# ex2.awk -- pair up rows of two whitespace-delimited files, dropping
# rows whose 4th and 5th fields are the same in both files.
#
# Usage: awk -f ex2.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
  k=$1" "$2
  a[k]=$4" "$5
  b[k]=$0
  c[k]=$4
  d[k]=$5
  next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
  # Both columns identical across the two files: skip the row entirely.
  if ((lc==$4) && (ld==$5)) next
  # && binds tighter than ||, so the "either column" test needs its own
  # parentheses; without them a lone ld==$5 match printed the row even
  # when $4!=$5.
  if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

Rerunning the new awk script, ex2.awk:

$ awk -f ex2.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4

A change in requirements

The OP mentioned in the comments below that they'd like the final solution to drop any line whose 4th and 5th columns in file1 match the 4th and 5th columns of the corresponding line in file2.

For example, add this line to both file1 & file2:

s2/40   40      .       S       S       90      N=2     F=5;U=4

A single line addition to the original solution can address this particular change in the requirements.

if ((k in a) && (lc==$4) && (ld==$5)) next

New Example

ex2.awk:

# ex2.awk -- pair up rows of two whitespace-delimited files, dropping
# rows whose 4th and 5th fields are the same in both files.
#
# Usage: awk -f ex2.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
  k=$1" "$2
  a[k]=$4" "$5
  b[k]=$0
  c[k]=$4
  d[k]=$5
  next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
  # Both columns identical across the two files: skip the row entirely.
  if ((lc==$4) && (ld==$5)) next
  # && binds tighter than ||, so the "either column" test needs its own
  # parentheses; without them a lone ld==$5 match printed the row even
  # when $4!=$5.
  if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

Rerunning the new awk script, ex2.awk:

$ awk -f ex2.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4

added 114 characters in body

Source Link

edited Aug 14, 2013 at 19:10

slm ♦

380.1k
127
793
897

Here's a solution using just awk. Put the below code in a file called ex.awk:

# ex.awk -- pair up rows of two whitespace-delimited files.
#
# Usage: awk -f ex.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5
    b[k]=$0
    c[k]=$4
    d[k]=$5
    next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
    # file1 file2
    # && binds tighter than ||, so the "either column" test needs its own
    # parentheses; without them a lone ld==$5 match would print the row
    # even when $4!=$5.
    if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

And then run it like this with the above 2 files:

$ awk -f ex.awk file1 file2

Example

The sed is just to format the output for StackExchange!

$ awk -f ex.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4

Here's a solution using just awk. Put the below code in a file called ex.awk:

# ex.awk -- join two files on their first two fields.
#
# Every row of the second file whose (field 1, field 2) key also occurs
# in the first file is printed, followed by two spaces and the matching
# row of the first file.

# While reading the first file, index each row by its key.
NR == FNR {
    key = $1 " " $2
    cols45[key] = $4 " " $5
    row[key] = $0
    next
}

# Remaining input: emit only rows whose key was seen in the first file.
(($1 " " $2) in cols45) {
    print $0 "  " row[$1 " " $2]
}

And then run it like this with the above 2 files:

$ awk -f ex.awk file2 file1

Example

The sed is just to format the output for StackExchange!

$ awk -f ex.awk file2 file1 | sed 's/[ ]\+/  /g'
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  T  55  N=2  F=5;U=4

Here's a solution using just awk. Put the below code in a file called ex.awk:

# ex.awk -- pair up rows of two whitespace-delimited files.
#
# Usage: awk -f ex.awk file1 file2
#
# Rows are keyed on fields 1 and 2.  A file2 row is printed, prefixed by
# the file1 row sharing its key, when file2's 4th and 5th fields are equal
# to each other and that value also equals file1's 4th or 5th field.

# First file (FNR==NR only while reading it): remember each row and its
# 4th/5th fields under the row's key.
FNR==NR{
    k=$1" "$2
    a[k]=$4" "$5
    b[k]=$0
    c[k]=$4
    d[k]=$5
    next
}

# Second file.
{ k=$1" "$2
  # Leave before touching c[k]/d[k]: merely referencing a missing
  # subscript would create an empty element.
  if (!(k in a)) next
  lc=c[k]
  ld=d[k]
    # file1 file2
    # && binds tighter than ||, so the "either column" test needs its own
    # parentheses; without them a lone ld==$5 match printed the row even
    # when $4!=$5.
    if (($4==$5) && ((lc==$4) || (ld==$5))) print b[k]" "$0
}

And then run it like this with the above 2 files:

$ awk -f ex.awk file1 file2

Example

The sed is just to format the output for StackExchange!

$ awk -f ex.awk file1 file2 | sed 's/[ ]\+/  /g'
s2/90  60  .  C  G  30  N=2  F=5;U=4  s2/90  60  .  G  G  97  N=2  F=5;U=4
s2/80  20  .  A  T  86  N=2  F=5;U=4  s2/80  20  .  A  A  20  N=2  F=5;U=4
s2/20  10  .  G  T  90  N=2  F=5;U=4  s2/20  10  .  G  G  99  N=2  F=5;U=4

Source Link

answered Aug 13, 2013 at 19:54

slm ♦

380.1k
127
793
897

Loading

Stack Exchange Network

Return to Answer

A change in requirements

New Example

A change in requirements

New Example

Example

Example

Example