Filtering data on a dataframe, Pandas-Jupyter

Question

I have a dataframe which contains information of programmers like: country, programming languages. etc:

COUNTRY    PROGRAMMING_LANGUAGE
usa         javascript
uk          python;swift;kotlin
india       python;ruby
usa         c++;c;assembly;python
canada      java;php;golang;ruby
angola      python;c#
india       c;java
brazil      javascript;php
canada      php;sql
india       c#;java
brazil      java;javascript
russia      java;kotlin
china       javascript
usa         python;c;c++
india       ruby
australia   javascrit
india       php;java
china       swift;kotlin
russia      php;sql
brazil      firebase;kotlin
uk          sql;firebase
canada      python;c
portugal    python;php

My program should display on a dataframe:

All countries;
How many people from each country use python;

COUNTRY   KNOWS_PYTHON
 
usa         2
uk          1
india       1
angola      1
canada      1
portugal    1
russia      0
brazil      0
australia   0
china       0

Please share your opinion about my algorithm, in any possible way to improve it:

import pandas as pd
import numpy as np

pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)

df = pd.DataFrame({
"PROGRAMMER":np.arange(0,25),
"AGE":np.array([22,30,np.nan,25,19,27,28,26,33,18,14,np.nan,29,35,19,30,29,24,21,52,np.nan,24,np.nan,18,25],dtype=np.float16),
"COUNTRY":['uSa','Uk','india','usa','Canada','AngOla','India','braZil','canada','india','brazil','russia','china','usa','india',np.nan,'Australia','india','China','russia','brazil','uk','canada','portugal','ChiNa'],
"PROGRAMMING_LANGUAGE":['JAVASCRIPT','python;swift;kotlin','python;ruby','c++;c;assembly;python','java;php;golang;ruby','python;c#','c;java','javascript;php','php;sql','c#;java','java;javascript','java;kotlin','javascript','python;c;c++','ruby',np.nan,'javascrit','php;java','swift;kotlin','php;sql','firebase;kotlin','sql;firebase','python;C','python;php',np.nan],
"GENDER":['male','female','male','male','female','female',np.nan,'male','female','male','male','female','female',np.nan,'female','male','male','male','female','male',np.nan,'male','female','male','male'],
"LED_ZEPPELIN_FAN":['yes','YES','yes','yes','yes','yes','yes','yes','yes','yes','yes','yes','yes',np.nan,'yes','yes','yes','yes','yes','yes','yes','yes','yes','yes','yes'],
})

#Replacing NaN value as 'missing'

df = df.fillna("missing")

filt = (df['COUNTRY'] != "missing") & (df['PROGRAMMING_LANGUAGE'] != "missing")

table = df.loc[filt,['COUNTRY','PROGRAMMING_LANGUAGE']]

table = table.applymap(str.lower)
table

#This is just a list with all countries(without duplicates), and it will be used later

total_countries = list(set(table['COUNTRY']))

#Filter rows that contain python as programming language

filt = table['PROGRAMMING_LANGUAGE'].str.contains('python',na=False)

table_python = table.loc[filt,['COUNTRY','PROGRAMMING_LANGUAGE']]

#Getting all countries that have programmers that use python(without duplicates)

countries = table_python['COUNTRY'].value_counts().index.tolist()

#Getting the number of programmers from each country that use python(including duplicates from each country)

quantities = []
for i in range(0,len(countries)):
    quantities.append(table_python['COUNTRY'].value_counts()[i])

#Comparing the list that contains all countries, with a list of countries that use python.
#If there is a country that doesn't have programmers that use python, these will be added to final with 0 as one of the values

for i in total_countries:
    if i not in countries:
        countries.append(i)
        quantities.append(0)

table_python = pd.DataFrame({"COUNTRY":countries,"KNOWS_PYTHON":quantities})

table_python.set_index('COUNTRY',inplace=True)

table_python

Reinderien · Accepted Answer · 2021-07-30 03:43:21Z

The construction of your dataframe could be improved; your PROGRAMMER column looks like it should be the index, and np.float16 is not a good representation for what looks to be integer data.
Not a good idea to fillna with a string and then compare to that string; instead operate on the NaN values directly
Should not be doing your own list, set or loops; this problem is fully vectorizable
Your df['PROGRAMMING_LANGUAGE'] != "missing" is counterproductive; rather than filtering away one country with a missing programming language, you'd want to count it as "0"

Suggested

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "PROGRAMMER": np.arange(0, 25),
    "AGE": np.array(
        [
            22, 30, np.nan, 25, 19, 27, 28, 26, 33, 18, 14, np.nan, 29, 35, 19, 30, 29, 24, 21, 52,
            np.nan, 24, np.nan, 18, 25,
        ],
        dtype=np.float16,
    ),
    "COUNTRY": [
        'uSa', 'Uk', 'india', 'usa', 'Canada', 'AngOla', 'India', 'braZil', 'canada', 'india',
        'brazil', 'russia', 'china', 'usa', 'india', np.nan, 'Australia', 'india', 'China',
        'russia', 'brazil', 'uk', 'canada', 'portugal', 'ChiNa',
    ],
    "PROGRAMMING_LANGUAGE": [
        'JAVASCRIPT', 'python;swift;kotlin', 'python;ruby', 'c++;c;assembly;python',
        'java;php;golang;ruby', 'python;c#', 'c;java', 'javascript;php', 'php;sql', 'c#;java',
        'java;javascript', 'java;kotlin', 'javascript', 'python;c;c++', 'ruby', np.nan, 'javascrit',
        'php;java', 'swift;kotlin', 'php;sql', 'firebase;kotlin', 'sql;firebase', 'python;C',
        'python;php', np.nan,
    ],
    "GENDER": [
        'male', 'female', 'male', 'male', 'female', 'female', np.nan, 'male', 'female', 'male',
        'male', 'female', 'female', np.nan, 'female', 'male', 'male', 'male', 'female', 'male',
        np.nan, 'male', 'female', 'male', 'male',
    ],
    "LED_ZEPPELIN_FAN": [
        'yes', 'YES', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
        np.nan, 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
    ],
})

# Get rid of all columns we don't care about for the final sum
df.drop(['PROGRAMMER', 'AGE', 'GENDER', 'LED_ZEPPELIN_FAN'], axis=1, inplace=True)

# Don't count countries that are undefined
df = df[df.COUNTRY.notna()]

# Standard case for both string columns
df['PROGRAMMING_LANGUAGE'] = df.PROGRAMMING_LANGUAGE.str.casefold()
df['COUNTRY'] = df.COUNTRY.str.title()

# Add a boolean column we'll use to apply a grouped sum
df['KNOWS_PYTHON'] = df.PROGRAMMING_LANGUAGE.str.contains('python', na=False)

sums = df.groupby(['COUNTRY']).sum()
sums.sort_values(by=['KNOWS_PYTHON'], ascending=False, inplace=True)

pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html — Reinderien
– Reinderien, Commented Jul 30, 2021 at 20:32

Stack Exchange Network

Filtering data on a dataframe, Pandas-Jupyter

1 Answer 1

Suggested

You must log in to answer this question.

Hot Network Questions

Filtering data on a dataframe, Pandas-Jupyter

1 Answer 1

Suggested

You must log in to answer this question.

Related

Hot Network Questions