from lec_utils import *

# Read the social media / entertainment dataset from CSV into a DataFrame and
# display it. NOTE(review): `pd` is presumably provided by the `lec_utils`
# star import above -- confirm.
df = pd.read_csv('data/social_media_entertainment.csv')
df

# Promote 'User ID' from an ordinary column to the index (the row labels).
# The column count drops by one; the row count is unchanged.
print(f'Original shape:  {df.shape}')
df = df.set_index(keys='User ID')
print(f'New shape:  {df.shape}')

Original shape:  (50145, 40)
New shape:  (50145, 39)

# Selecting a single column with [] gives a Series that keeps the DataFrame's
# index (User ID) as its labels.
s = df['Preferred Entertainment Platform']
s

User ID
3              Spotify
8         Amazon Prime
24             Spotify
              ...     
299988         Netflix
299990         Netflix
299999         Spotify
Name: Preferred Entertainment Platform, Length: 50145, dtype: object

# What proportion of users prefer each entertainment platform?
# normalize=True reports proportions instead of raw counts.
preferred_platform = df['Preferred Entertainment Platform']
preferred_platform.value_counts(normalize=True)

Preferred Entertainment Platform
Spotify         0.25
YouTube         0.25
Netflix         0.25
Amazon Prime    0.25
Name: proportion, dtype: float64

# Which entertainment platform do the most users prefer?
# value_counts() is indexed by platform name, so idxmax() returns the name of
# the platform with the largest count.
platform_counts = df['Preferred Entertainment Platform'].value_counts()
platform_counts.idxmax()

'Spotify'

# How many users prefer the leading entertainment platform?
# The largest entry of the per-platform counts is the answer.
platform_counts = df['Preferred Entertainment Platform'].value_counts()
platform_counts.max()

12625

# How many users prefer the least popular entertainment platform?
# The smallest entry of the per-platform counts is the answer.
platform_counts = df['Preferred Entertainment Platform'].value_counts()
platform_counts.min()

12485

# Select rows 1-6 in our DataFrame along with columns 3-8, by integer position.
# iloc counts from 0 and excludes the end of each slice, hence 1:7 and 3:9.
df.iloc[1:7, 3:9]

# Show the Sleep Quality for the user with ID 3.
# loc looks up by label: 3 here is a User ID (an index value), not a row position.
df.loc[3, 'Sleep Quality (scale 1-10)']

5

# Rebind s to the daily social media time column; the examples that follow
# use this Series, which is labeled by User ID.
s = df['Daily Social Media Time (hrs)']
s

User ID
3         6.78
8         4.20
24        0.78
          ... 
299988    3.16
299990    4.37
299999    6.47
Name: Daily Social Media Time (hrs), Length: 50145, dtype: float64

# A single column selected from a DataFrame is a pandas Series.
type(s)

pandas.core.series.Series

# What is the daily social media time (in hours) of user with ID 3?
# loc selects by index label, so 3 is matched against the User ID values.
s.loc[3]

6.78

# What is the daily social media time (in hours) of the user in row 3 (zero-indexed)?
# iloc selects by integer position, so this is the fourth row regardless of
# its User ID label.
s.iloc[3]

7.72

# Show the first four entries to see which User ID sits at each position;
# the last one shown is position 3.
s.head(4)

User ID
3     6.78
8     4.20
24    0.78
25    7.72
Name: Daily Social Media Time (hrs), dtype: float64

# What is the highest daily social media usage time (in hours)?
# max() finds it in a single pass; there is no need to sort the whole Series
# just to read off its first element.
s.max()

8.0

# loc takes labels for both axes: df.loc[row_label, column_label].
# The first argument is the row label, i.e. the index value.
#        ↓
df.loc[3, 'Daily Social Media Time (hrs)']
#                  ↑
# The second argument is the column label.

6.78

# Passing a list of row labels and a list of column labels to loc selects
# several rows and columns at once.
df.loc[[3, 8, 24, 25], ['Primary Platform', 'Daily Social Media Time (hrs)']]

# The first 6 rows and the first 8 columns, selected by integer position.
df.iloc[:6, :8]

# Find the age of the user with the highest daily messaging time.
# Order the rows from most to least 'Daily Messaging Time (hrs)', then read
# the 'Age' of whichever row ends up first.
by_messaging_time = df.sort_values('Daily Messaging Time (hrs)', ascending=False)
by_messaging_time['Age'].iloc[0]

31

# Comparing a column to a value gives a Boolean Series: True for the users
# whose primary platform is TikTok, False for everyone else.
df['Primary Platform'] == 'TikTok'

User ID
3         False
8         False
24        False
          ...  
299988    False
299990    False
299999    False
Name: Primary Platform, Length: 50145, dtype: bool

# Passing a Boolean Series to loc keeps only the rows where it is True,
# i.e. the users whose primary platform is TikTok.
df.loc[df['Primary Platform'] == 'TikTok']

# Users whose primary platform is TikTok AND who spend more than 3 hours a
# day on social media. & combines the two Boolean Series element-wise.
uses_tiktok = df['Primary Platform'] == 'TikTok'
over_three_hours = df['Daily Social Media Time (hrs)'] > 3.0
df.loc[uses_tiktok & over_three_hours]

# How many users use TikTok as their primary platform?
# Keep only the TikTok rows and count how many are left.
uses_tiktok = df['Primary Platform'] == 'TikTok'
len(df.loc[uses_tiktok])

10127

# Among users who use TikTok as their primary platform, what is the highest Daily Social Media Time?
# loc filters the rows and picks the column in one step; max() then reads off
# the largest value in a single pass instead of sorting every TikTok row.
df.loc[df['Primary Platform'] == 'TikTok', 'Daily Social Media Time (hrs)'].max()

8.0

# Among users whose 'Preferred Entertainment Platform' is Spotify, which
# gender is the most common? This compares raw counts within the Spotify
# group, not each gender's rate of preferring Spotify.
prefers_spotify = df['Preferred Entertainment Platform'] == 'Spotify'
gender_counts = df.loc[prefers_spotify, 'Gender'].value_counts()
gender_counts.idxmax()

'Female'

	User ID	Age	Gender	Country	...	Digital Well-being Awareness	Sleep Quality (scale 1-10)	Social Isolation Feeling (scale 1-10)	Monthly Expenditure on Entertainment (USD)
0	3	51	Female	USA	...	High	5	3	71.72
1	8	58	Female	USA	...	Low	9	1	9.65
2	24	32	Female	USA	...	Low	6	1	280.82
...	...	...	...	...	...	...	...	...	...
50142	299988	48	Other	USA	...	High	5	3	350.45
50143	299990	24	Other	USA	...	High	7	1	108.41
50144	299999	15	Male	USA	...	High	7	5	432.00

	Daily Social Media Time (hrs)	Daily Entertainment Time (hrs)	Social Media Platforms Used	Primary Platform	Daily Messaging Time (hrs)	Daily Video Content Time (hrs)
User ID
8	4.20	2.77	4	Facebook	3.54	0.69
24	0.78	0.55	2	Facebook	0.57	2.65
25	7.72	5.83	4	YouTube	1.37	5.21
32	0.62	7.94	1	YouTube	1.85	6.92
45	7.45	9.85	5	Facebook	4.14	2.23
47	0.92	2.93	4	TikTok	2.80	4.54

	Age	Gender	Country	Daily Social Media Time (hrs)	Daily Entertainment Time (hrs)	Social Media Platforms Used	Primary Platform	Daily Messaging Time (hrs)
User ID
3	51	Female	USA	6.78	1.77	4	Facebook	2.09
8	58	Female	USA	4.20	2.77	4	Facebook	3.54
24	32	Female	USA	0.78	0.55	2	Facebook	0.57
25	36	Female	USA	7.72	5.83	4	YouTube	1.37
32	21	Male	USA	0.62	7.94	1	YouTube	1.85
45	26	Female	USA	7.45	9.85	5	Facebook	4.14

	Age	Gender	Country	Daily Social Media Time (hrs)	...	Digital Well-being Awareness	Sleep Quality (scale 1-10)	Social Isolation Feeling (scale 1-10)	Monthly Expenditure on Entertainment (USD)
User ID
47	29	Male	USA	0.92	...	Low	5	9	452.78
48	40	Other	USA	7.27	...	Low	6	6	412.09
52	53	Male	USA	6.90	...	High	5	7	188.51
...	...	...	...	...	...	...	...	...	...
299906	37	Other	USA	5.77	...	High	2	8	481.62
299934	52	Male	USA	2.19	...	High	6	1	42.33
299980	35	Female	USA	5.44	...	High	5	7	324.45

	Age	Gender	Country	Daily Social Media Time (hrs)	...	Digital Well-being Awareness	Sleep Quality (scale 1-10)	Social Isolation Feeling (scale 1-10)	Monthly Expenditure on Entertainment (USD)
User ID
48	40	Other	USA	7.27	...	Low	6	6	412.09
52	53	Male	USA	6.90	...	High	5	7	188.51
55	44	Female	USA	3.99	...	Low	9	8	274.86
...	...	...	...	...	...	...	...	...	...
299844	56	Male	USA	4.37	...	High	2	1	359.47
299906	37	Other	USA	5.77	...	High	2	8	481.62
299980	35	Female	USA	5.44	...	High	5	7	324.45

Discussion Slides: DataFrames and Querying

Agenda 📆¶

Today's Dataset 📲¶

Choosing an index¶

`value_counts`¶

`loc` vs. `iloc`¶

Using `loc` and `iloc` on a DataFrame¶

Querying 🔍¶

Discussion Slides: DataFrames and Querying

Agenda 📆¶

Today's Dataset 📲¶

Choosing an index¶

value_counts¶

loc vs. iloc¶

Example: Daily Social Media Usage 📱¶

Using loc and iloc on a DataFrame¶

Querying 🔍¶

`value_counts`¶

`loc` vs. `iloc`¶

Using `loc` and `iloc` on a DataFrame¶