
input.csv -

"family_guy","Brian","b47f0a80-d848-4d81-a45b-7ba930e6048b","son"
"family_guy","Brian","b47f0a80-d848-4d81-a45b-7ba930e6048b","son"
"family_guy","Brian","0c160a66-0a16-48ac-99bf-ca3610ee5a8c","son"
"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son"

"Family_Guy","Peter","b47f0a80-d848-4d81-a45b-7ba930e6048b","self"
"Family_Guy","Peter","ff764eac-03de-4dcf-a7d6-349e3ce4656e","self"
"Family_Guy","Peter","065d43db-ed70-4ab0-aecc-c5ff88f2efc3","self"
"Family_Guy","Peter","065d43db-ed70-4ab0-aecc-c5ff88f2efc3","self"

"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son"
"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son"

"family_guy","Brian","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","son"
"family_guy","stewie","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","son"
"family_guy","Lois","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","wife"
"family_guy","Meg","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","daughter"

"family_guy","Lois","9a219433-3e66-4a7d-b9e6-e25f127be003","wife"
"family_guy","Lois","9a219433-3e66-4a7d-b9e6-e25f127be003","wife"
"family_guy","Lois","674387a3-877c-4107-ae7b-524c32ee8a09","wife"
"family_guy","Lois","25da2435-5270-4b2a-bce1-e1a16e982189","wife"
"family_guy","Lois","35b19677-3c43-424d-a52a-6cf46ae7018a","wife"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter"

"family_guy","bonnie","25da2435-5270-4b2a-bce1-e1a16e982189","neighbor"
"family_guy","bonnie","0ec6beb4-5864-42eb-9d3f-16213a67d4d9","neighbor"
"family_guy","bonnie","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor"
"family_guy","bonnie","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor"
"family_guy","chiken","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor"

"Family_Guy","Cleveland","d7c9ad94-cde1-45a4-b948-7366b51d673c","Funny1"

"Family_Guy","Cleveland","d7c82ef8-1a83-4065-a7a0-8e5a7e394486","Funny1"

"family_guy","Brian","56e3acf2-0ea2-413f-9eb5-2678b64c1591","Funny5"
"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5"
"family_guy","Brian","4cbd733d-ff77-4203-aed9-5f6da129859e","Funny5"
"family_guy","Brian","4cbd733d-ff77-4203-aed9-5f6da129859e","Funny5"

"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5"
"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5"

Expected output, output.csv -

"family_guy","Brian","b47f0a80-d848-4d81-a45b-7ba930e6048b","son","NO","N/A"
"family_guy","Brian","b47f0a80-d848-4d81-a45b-7ba930e6048b","son","NO","N/A"
"family_guy","Brian","0c160a66-0a16-48ac-99bf-ca3610ee5a8c","son","YES","b47f0a80-d848-4d81-a45b-7ba930e6048b"
"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son","YES","b47f0a80-d848-4d81-a45b-7ba930e6048b"

"Family_Guy","Peter","b47f0a80-d848-4d81-a45b-7ba930e6048b","self","YES","065d43db-ed70-4ab0-aecc-c5ff88f2efc3"
"Family_Guy","Peter","ff764eac-03de-4dcf-a7d6-349e3ce4656e","self","YES","065d43db-ed70-4ab0-aecc-c5ff88f2efc3"
"Family_Guy","Peter","065d43db-ed70-4ab0-aecc-c5ff88f2efc3","self","NO","N/A"
"Family_Guy","Peter","065d43db-ed70-4ab0-aecc-c5ff88f2efc3","self","NO","N/A"

"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son","YES","25da2435-5270-4b2a-bce1-e1a16e982189"
"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son","YES","25da2435-5270-4b2a-bce1-e1a16e982189"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","NO","N/A"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","NO","N/A"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","NO","N/A"

"family_guy","Brian","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","son","NO","N/A"
"family_guy","stewie","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","son","NO","N/A"
"family_guy","Lois","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","wife","NO","N/A"
"family_guy","Meg","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","daughter","NO","N/A"

"family_guy","Lois","9a219433-3e66-4a7d-b9e6-e25f127be003","wife","NO","N/A"
"family_guy","Lois","9a219433-3e66-4a7d-b9e6-e25f127be003","wife","NO","N/A"
"family_guy","Lois","674387a3-877c-4107-ae7b-524c32ee8a09","wife","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Lois","25da2435-5270-4b2a-bce1-e1a16e982189","wife","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Lois","35b19677-3c43-424d-a52a-6cf46ae7018a","wife","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter","NO","N/A"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter","NO","N/A"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter","NO","N/A"

"family_guy","bonnie","25da2435-5270-4b2a-bce1-e1a16e982189","neighbor","YES","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1"
"family_guy","bonnie","0ec6beb4-5864-42eb-9d3f-16213a67d4d9","neighbor","YES","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1"
"family_guy","bonnie","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor","NO","N/A"
"family_guy","bonnie","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor","NO","N/A"
"family_guy","chiken","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor","NO","N/A"

"Family_Guy","Cleveland","d7c9ad94-cde1-45a4-b948-7366b51d673c","Funny1","NO","N/A"

"Family_Guy","Cleveland","d7c82ef8-1a83-4065-a7a0-8e5a7e394486","Funny1","NO","N/A"

"family_guy","Brian","56e3acf2-0ea2-413f-9eb5-2678b64c1591","Funny5","YES","4cbd733d-ff77-4203-aed9-5f6da129859e"
"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5","YES","4cbd733d-ff77-4203-aed9-5f6da129859e"
"family_guy","Brian","4cbd733d-ff77-4203-aed9-5f6da129859e","Funny5","NO","N/A"
"family_guy","Brian","4cbd733d-ff77-4203-aed9-5f6da129859e","Funny5","NO","N/A"

"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5","NO","N/A"
"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5","NO","N/A"

In file input.csv:

  • Rows are grouped together based upon a specific criterion, and groups are separated by a single blank line. The same grouping is preferred in output.csv.

    • Let's call the 3rd field value XID.

    • Groups can share XID values.

    • The number of fields is constant for every row/group.

    • The decision to append YES/NO and N/A to a row depends on its group and on the value of its 3rd field:

      • "NO" is appended after the last field of a row if its 3rd field value is the majority in that group (i.e. the value that appears most often in that specific group), with "N/A" in the field after that.
      • "YES" is appended after the last field of a row if its 3rd field value is in the minority in that group, with the XID of the majority in the field after that.
    • Groups can contain 100+ rows, but the number of fields is constant.

    • The actual input.csv has 5K+ rows (including blank lines) and 27 fields.

I've hit my limit with awk/shell coding. Any clue would help. The files are on RHEL 7.9 and the solution will run in a bash console.

  • You really shouldn't call something "unique ID" if it's not unique – that just leads to confusion. Commented Jul 14, 2024 at 17:01
  • What's the rule if 2 or more XIDs have the same number of occurrences (within a given group)? For example, a 2-row group has 2 distinct XID values? A 3-row group has 3 distinct XID values?
    – markp-fuso
    Commented Jul 14, 2024 at 20:47
  • Let me rephrase that ... you have a group with 3 rows, each row has a unique XID; when determining what goes on the end of the 3 rows, is it "NO","N/A" for all 3 rows? Or does each row end with "YES",which_XID?
    – markp-fuso
    Commented Jul 15, 2024 at 0:46
  • @markp-fuso If XIDs are unique, then it's "NO","N/A" for all 3 rows. For that matter, for n rows. I should have included that case in output.csv as well, but there wasn't one in my actual input.csv, so it was easy for me to miss. Sorry about that. Commented Jul 15, 2024 at 10:29
  • That's fine, but then don't give the file a ".csv" suffix; just make it ".txt" or similar so it's not misleading people into thinking it's a CSV file. If you used a row of all empty fields, with the same number of fields as the data rows, instead of an empty row, THEN it'd be valid CSV and CSV-aware tools could be used on it.
    – Ed Morton
    Commented Jul 20, 2024 at 12:08

2 Answers


Here's one awk approach:

## This function takes as input an awk array whose indices are
## IDs and whose values are the number of times each ID has been seen.
## It returns the ID with the maximum count. If there are >1 IDs with the
## same count, it will return the last one encountered while scanning the array.
function get_maj(list)
{
  max=0;
  maj="";
  for(id in list){
    if(list[id] > max){
      max=list[id];
      maj=id;
    }
  }
  return maj
}

## This function builds the trailing fields. It compares two IDs: if they
## are the same, it returns "NO","N/A"; if they differ, it returns "YES"
## followed by the second (majority) ID.
function last_field(id, maj){
    field = id == maj ? "\"NO\",\"N/A\"" : "\"YES\","maj;
    return field
}
## This is only run once, before any input is read. It sets the input and
## output field separators to comma, and (gawk-specific) makes "for (i in array)"
## loops walk numeric indices in ascending order, so each group's lines are
## printed back in their original order.
BEGIN{
  FS=",";
  OFS=",";
  PROCINFO["sorted_in"]="@ind_num_asc";
}

{
## If this is an empty line, we have just finished reading a group
  if(/^$/){
    ## get the majority ID
    maj = get_maj(group_id)
    ## iterate over the lines of this group
    for (i in lines){
      ## set the last field for this line, and print 
      last = last_field(line_id[i], maj)
      print lines[i],last
    }
    ## keep the empty lines between groups
    print "";
    ## empty the arrays; note this is not 100% portable
    delete group_id;
    delete line_id;
    delete lines
  }
  ## If this is not an empty line, process. 
  else{
    ## count the number of times we have seen this ID
    group_id[$3]++;
    ## use the current line number as a unique index for the 
    ## array line_id and set this line's ID as the corresponding value
    line_id[NR]=$3;
    ## Save this line ($0) as the value in the lines array
    lines[NR]=$0
  }
}
## The last group doesn't have an empty line after it, so 
## we need to print it explicitly
END{
  maj = get_maj(group_id)
  for (i in lines){
    last = last_field(line_id[i], maj)
    print lines[i],last
  }
}

Save the script as foo.awk and then run it like this (using GNU awk here, since I am not sure about portability and you are on Linux anyway):

gawk -f /path/to/foo.awk /path/to/file
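
To write the result straight into a file such as the question's output.csv, the usual shell redirection works:

gawk -f /path/to/foo.awk input.csv > output.csv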

On your example, this produces:

 $ gawk -f ~/scripts/foo.awk file 
"family_guy","Brian","b47f0a80-d848-4d81-a45b-7ba930e6048b","son","NO","N/A"
"family_guy","Brian","b47f0a80-d848-4d81-a45b-7ba930e6048b","son","NO","N/A"
"family_guy","Brian","0c160a66-0a16-48ac-99bf-ca3610ee5a8c","son","YES","b47f0a80-d848-4d81-a45b-7ba930e6048b"
"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son","YES","b47f0a80-d848-4d81-a45b-7ba930e6048b"

"Family_Guy","Peter","b47f0a80-d848-4d81-a45b-7ba930e6048b","self","YES","065d43db-ed70-4ab0-aecc-c5ff88f2efc3"
"Family_Guy","Peter","ff764eac-03de-4dcf-a7d6-349e3ce4656e","self","YES","065d43db-ed70-4ab0-aecc-c5ff88f2efc3"
"Family_Guy","Peter","065d43db-ed70-4ab0-aecc-c5ff88f2efc3","self","NO","N/A"
"Family_Guy","Peter","065d43db-ed70-4ab0-aecc-c5ff88f2efc3","self","NO","N/A"

"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son","YES","25da2435-5270-4b2a-bce1-e1a16e982189"
"family_guy","Brian","9a219433-3e66-4a7d-b9e6-e25f127be003","son","YES","25da2435-5270-4b2a-bce1-e1a16e982189"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","NO","N/A"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","NO","N/A"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","NO","N/A"

"family_guy","Brian","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","son","NO","N/A"
"family_guy","stewie","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","son","NO","N/A"
"family_guy","Lois","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","wife","NO","N/A"
"family_guy","Meg","d2bbf741-5d68-4ea0-8a78-9fa96f96fd20","daughter","NO","N/A"

"family_guy","Lois","9a219433-3e66-4a7d-b9e6-e25f127be003","wife","NO","N/A"
"family_guy","Lois","9a219433-3e66-4a7d-b9e6-e25f127be003","wife","NO","N/A"
"family_guy","Lois","674387a3-877c-4107-ae7b-524c32ee8a09","wife","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Lois","25da2435-5270-4b2a-bce1-e1a16e982189","wife","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Lois","35b19677-3c43-424d-a52a-6cf46ae7018a","wife","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter","NO","N/A"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter","NO","N/A"
"family_guy","Brian","25da2435-5270-4b2a-bce1-e1a16e982189","son","YES","9a219433-3e66-4a7d-b9e6-e25f127be003"
"family_guy","Meg","9a219433-3e66-4a7d-b9e6-e25f127be003","daughter","NO","N/A"

"family_guy","bonnie","25da2435-5270-4b2a-bce1-e1a16e982189","neighbor","YES","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1"
"family_guy","bonnie","0ec6beb4-5864-42eb-9d3f-16213a67d4d9","neighbor","YES","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1"
"family_guy","bonnie","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor","NO","N/A"
"family_guy","bonnie","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor","NO","N/A"
"family_guy","chiken","9708e5bd-41e5-4248-9b7f-7f1f3341f3e1","neighbor","NO","N/A"

"Family_Guy","Cleveland","d7c9ad94-cde1-45a4-b948-7366b51d673c","Funny1","NO","N/A"

"Family_Guy","Cleveland","d7c82ef8-1a83-4065-a7a0-8e5a7e394486","Funny1","NO","N/A"

"family_guy","Brian","56e3acf2-0ea2-413f-9eb5-2678b64c1591","Funny5","YES","4cbd733d-ff77-4203-aed9-5f6da129859e"
"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5","YES","4cbd733d-ff77-4203-aed9-5f6da129859e"
"family_guy","Brian","4cbd733d-ff77-4203-aed9-5f6da129859e","Funny5","NO","N/A"
"family_guy","Brian","4cbd733d-ff77-4203-aed9-5f6da129859e","Funny5","NO","N/A"

"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5","NO","N/A"
"family_guy","Brian","4edcfdc8-6259-4de8-8500-5cb132814eec","Funny5","NO","N/A"

Note that if a group has more than one ID tied for the highest count, so there isn't a single majority ID, the script will use whichever of the tied IDs it encounters last while scanning the counts.

You can even run it as a "one-liner", if you don't want your code to be readable :) :

gawk -F',' -v OFS=',' 'function get_maj(list){max=0;maj="";for(id in list){if(list[id] > max){max=list[id];maj=id}} return maj} function last_field(id, maj){field = id == maj ? "\"NO\",\"N/A\"" : "\"YES\","maj; return field}BEGIN{FS=",";OFS=","}{if(/^$/){maj = get_maj(group_id);for(i in lines){last=last_field(line_id[i],maj);print lines[i],last}print;delete group_id;delete line_id;delete lines}else{group_id[$3]++;line_id[NR]=$3;lines[NR]=$0;}}END{maj=get_maj(group_id);for(i in lines){last=last_field(line_id[i],maj);print lines[i],last}}' file
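
One edge case raised in the comment thread above: the OP clarified that a group in which every XID is unique (each value occurs exactly once) should get "NO","N/A" on every row. As written, the script still picks one of those XIDs as the "majority" and marks the other rows "YES". An untested sketch of a minimal tweak, keeping the same function names, could look like this; the rest of the script stays unchanged:

## Return the majority ID only if some ID occurs more than once;
## otherwise return "" to signal that there is no majority at all.
function get_maj(list)
{
  max=0;
  maj="";
  for(id in list){
    if(list[id] > max){
      max=list[id];
      maj=id;
    }
  }
  return (max > 1) ? maj : ""
}

## Treat "no majority" the same as "this row's ID is the majority ID":
## append "NO","N/A"; otherwise append "YES" plus the majority ID.
function last_field(id, maj){
  return (maj == "" || id == maj) ? "\"NO\",\"N/A\"" : "\"YES\","maj
}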

This sounds too complicated to do reliably and quickly in awk.

You'll want to treat your "row groups" individually. A program in a mainstream programming language with a reasonable CSV reader and writer library sounds like a better fit for this. Here is a short sketch in nearly-Python code.

import csv
from sys import argv, exit, stdout
from collections import Counter

if len(argv) != 2:
    print("Need exactly one file name: the file to read")
    exit(128)

# Collect the groups of rows, which are separated by blank lines.
all_groups = []
with open(argv[1], "r", encoding="utf-8") as infile:
    current_grp = []
    for line in infile:
        if line.strip() == "":     # blank line: the current group is complete
            if current_grp:
                all_groups.append(current_grp)
            current_grp = []
        else:
            current_grp.append(line)
    if current_grp:                # the last group may not end with a blank line
        all_groups.append(current_grp)

# make sure "unix" is the dialect you want. It looks like it, but I can only guess.
parsed_groups = (list(csv.reader(row_group, dialect="unix")) for row_group in all_groups)
writer = csv.writer(stdout, "unix")
for group in parsed_groups:
    third_element_counts = Counter(row[2] for row in group)
    most_common_elem, most_common_count = third_element_counts.most_common(1)[0]

    for row in group:
        # "NO","N/A" if this row's 3rd field holds the group's most common value
        # (or if no value repeats at all, per the comments on the question);
        # otherwise "YES" plus the most common value
        if row[2] == most_common_elem or most_common_count == 1:
            extended_row = row + ["NO", "N/A"]
        else:
            extended_row = row + ["YES", most_common_elem]
        writer.writerow(extended_row)
    print("")  # empty line to separate groups

Or so. Not tested, probably contains typos, written off the top of my head.

Don't try to be smart in a string processing language if what you need is business logic!
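
Assuming the sketch above is saved as, say, group_majority.py (the file name is just an example), it could be run from the bash console like this:

python3 group_majority.py input.csv > output.csv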
