import json

import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import itertools

pio.templates.default = 'plotly_white'
pio.renderers.default = 'notebook'

[
 {
  "tag": {
    "name": "...",
    "num_articles": N  // (1) count from the https://dev.to/tags page
  },
  "top_articles": [
    {
      "id": 468082,
      "title": "...",
      "comments_count": 112,
      "public_reactions_count": 10893,
      "reading_time": 5,
      "tag_list": [...],
      // ... and much more
    },
    // ... more articles
  ],
  "total": N         // (2) count from the https://dev.to/t/<TAG> page
 }
]


# Load the per-tag dataset. The encoding is pinned because JSON files are
# UTF-8, while open() would otherwise fall back to the platform default.
with open('../top_articles_by_tag.json', encoding='utf-8') as f:
    tags_data = json.load(f)

def sum_article_prop(entry, prop):
    """Add up one numeric property over all top articles of a tag entry.

    ``entry`` must provide a ``'top_articles'`` list of article dicts and
    ``prop`` is the article key to total. Returns 0 when the list is empty.
    """
    total = 0
    for article in entry['top_articles']:
        total += article[prop]
    return total

# One row per tag: its name, its article total, and the reactions/comments
# summed over the tag's top articles.
tags = pd.DataFrame(
    [
        [
            entry['tag']['name'],
            entry['total'],
            sum_article_prop(entry, 'public_reactions_count'),
            sum_article_prop(entry, 'comments_count'),
        ]
        for entry in tags_data
    ],
    columns=['tag', 'count', 'reactions', 'comments'],
)

# Most-used tags first. drop=True discards the pre-sort positions instead of
# leaving them behind as a stray 'index' column.
tags = tags.sort_values('count', ascending=False).reset_index(drop=True)


# Plot the raw per-tag totals, one line per metric, and print each metric's
# grand total along the way.
fig = go.Figure()

for column in ('count', 'reactions', 'comments'):
    values = tags[column]
    print(f'Total {column}: {values.sum():,}')
    trace = go.Scatter(x=tags.tag, y=values, mode='lines+markers', name=column)
    fig.add_trace(trace)

fig.update_layout(
    title='Top tags',
    xaxis={'title': 'tag', 'tickmode': 'linear'},
    margin={'l': 0, 'r': 0, 't': 30, 'b': 0},
    legend={'orientation': 'h', 'yanchor': 'auto', 'y': 1.0, 'xanchor': 'auto', 'x': .5},
)
fig.show()
fig.write_html("plot_tags.html")

Total count: 621,345
Total reactions: 4,211,041
Total comments: 211,510


# Same plot as the raw totals, but every metric is expressed as a percentage
# of its own grand total so the three lines share one scale.
fig = go.Figure()

for column in ('count', 'reactions', 'comments'):
    share = tags[column] / tags[column].sum() * 100
    trace = go.Scatter(x=tags.tag, y=share, mode='lines+markers', name=column)
    fig.add_trace(trace)

fig.update_layout(
    title='Top tags - normalized',
    xaxis={'title': 'tag', 'tickmode': 'linear'},
    yaxis={'title': '%', 'ticklabelposition': 'inside'},
    margin={'l': 0, 'r': 0, 't': 30, 'b': 0},
    legend={'orientation': 'h', 'yanchor': 'auto', 'y': 1.0, 'xanchor': 'auto', 'x': .5},
)
fig.show()

fig.write_html("plot_tags_normalized.html")


def pp(condition):
    """Format the tags selected by ``condition`` as one comma-separated line.

    ``condition`` filters the module-level ``tags`` frame. The returned
    string starts with a newline followed by a four-space indent.
    """
    selected = tags[condition].tag.tolist()
    return f"\n    {', '.join(selected)}"

# ANSI escape sequences used to embolden the bucket label in the output.
bold = '\033[1m'
reset = '\x1b[0m'

# Walk consecutive (high, low) bounds, in thousands of articles, and list the
# tags whose article count falls in each bucket.
for high, low in itertools.pairwise([70, 30, 20, 10, 5, 2, 1, 0]):
    print(f'Between {bold}{low}K-{high}K{reset} articles:')
    # Half-open [low, high): with the default both-ends-inclusive test a tag
    # sitting exactly on a boundary would be listed in two adjacent buckets.
    condition = tags['count'].between(low * 1000, high * 1000, inclusive='left')
    print('   ' + ', '.join(tags[condition].tag.tolist()))
    print()

Between 30K-70K articles:
   javascript, webdev, beginners

Between 20K-30K articles:
   tutorial, react, programming

Between 10K-20K articles:
   python, discuss, productivity, css, career, node, devops, codenewbie

Between 5K-10K articles:
   html, opensource, typescript, aws, showdev, github, java, testing, docker, php, security, linux, vue, ruby, git

Between 2K-5K articles:
   angular, go, database, dotnet, csharp, serverless, machinelearning, kubernetes, rails, computerscience, cloud, android, design, laravel, azure, api, algorithms, architecture, help, learning, datascience, vscode, reactnative, graphql, frontend, nextjs, flutter, watercooler, django, ios, codepen, sql, rust, todayilearned, blockchain, performance

Between 1K-2K articles:
   hacktoberfest, startup, kotlin, motivation, news, coding, challenge, mongodb, development, microservices, tailwindcss, postgres, cpp, npm, ux, gamedev, wordpress, writing, devjournal, mobile, dart, leetcode, ai, agile, firebase, management, tooling, meta, braziliandevs, mysql

Between 0K-1K articles:
   web3, community, cybersecurity, actionshackathon21, archlinux


# Rank table: for every tag, its position on each metric and how far the
# reactions/comments rank is from the article-count rank.
df = tags[['tag']].copy()

for column in ['count', 'reactions', 'comments']:
    # rank() is ascending, so subtracting it from n+1 flips it: 1 = largest.
    # Tied values get an averaged rank, which the int cast then truncates.
    df[column] = ((df.shape[0]+1) - tags[column].rank()).astype('int32')

df['∂r'] = df['count'] - df.reactions
df['∂c'] = df['count'] - df.comments

# Symmetric colour limits must come from the largest *absolute* delta;
# abs(max()) only looked at the positive side, so larger negative deltas
# were clipped to the end of the colour scale.
limit_r = df['∂r'].abs().max()
limit_c = df['∂c'].abs().max()

(
    df.style
    .hide(axis='index')
    .background_gradient(subset='∂r', cmap='seismic_r', vmin=-limit_r, vmax=limit_r)
    .background_gradient(subset='∂c', cmap='seismic_r', vmin=-limit_c, vmax=limit_c)
)


# Load the top-articles dataset. The encoding is pinned because JSON files
# are UTF-8, while open() would otherwise fall back to the platform default.
with open('../top_articles.json', encoding='utf-8') as f:
    articles_data = json.load(f)

# One row per article. The tag list is sorted, and an article without a
# 'tag_list' key gets an empty list.
articles = pd.DataFrame(
    [
        {
            'tags': sorted(article.get('tag_list', [])),
            'id': article['id'],
            'title': article['title'],
            'date': article['published_at_int'],
            'reactions': article['public_reactions_count'],
            'comments': article['comments_count'],
            'readtime': article['reading_time'],
        }
        for article in articles_data
    ],
    columns=['tags', 'id', 'title', 'date', 'reactions', 'comments', 'readtime'],
)

articles.describe()


# Explode to one row per (article, tag) pair, then total the metrics per tag.
# An article with several tags therefore contributes to each of them.
articles_by_tag = (
    articles.explode('tags')
    .groupby('tags')
    .agg(
        count=('id', 'count'),
        reactions=('reactions', 'sum'),
        comments=('comments', 'sum'),
    )
    .sort_values('count', ascending=False)
    .reset_index()
    .rename(columns={'tags': 'tag'})
)

print("How the dataframe looks:")
display(articles_by_tag.head())
print("Statistics:")
articles_by_tag.describe()

How the dataframe looks:

Statistics:


# Raw totals of the n most frequent tags among the top articles.
n = 100
df = articles_by_tag

fig = go.Figure()

for column in ['count', 'reactions', 'comments']:
    print(f'Total {column}: {df[column].sum():,}')
    # Both axes are cut with the same n; y used to be hard-coded to 100.
    fig.add_trace(go.Scatter(x=df.tag.head(n), y=df[column].head(n), mode='lines+markers', name=column))


fig.update_layout(
    title=f'Top {n} tags appearing in the top 10,000 articles',
    xaxis=dict(title='tag', tickmode='linear'),
    # No '%' axis title here: this plot shows raw totals, not percentages.
    yaxis=dict(ticklabelposition='inside'),
    margin=dict(l=0, r=0, t=30, b=0),
    legend=dict(orientation='h', yanchor='auto', y=1.0, xanchor='auto', x=.5)
)

fig.show()

fig.write_html("plot_articles.html")

Total count: 35,490
Total reactions: 11,772,508
Total comments: 736,470


# Same tags as the raw plot, with every metric expressed as a percentage of
# its total over all tags (not only over the n tags shown).
n = 100
df = articles_by_tag

fig = go.Figure()

for column in ('count', 'reactions', 'comments'):
    percentages = df[column] / df[column].sum() * 100
    trace = go.Scatter(
        x=df.tag.head(n),
        y=percentages.head(n),
        mode='lines+markers',
        name=column,
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Top 100 tags appearing in the top 10,000 articles - normalized',
    xaxis={'title': 'tag', 'tickmode': 'linear'},
    yaxis={'title': '%', 'ticklabelposition': 'inside'},
    margin={'l': 0, 'r': 0, 't': 30, 'b': 0},
    legend={'orientation': 'h', 'yanchor': 'auto', 'y': 1.0, 'xanchor': 'auto', 'x': .5},
)

fig.show()

fig.write_html("plot_articles_normalized.html")


# Percentage of each metric's total that the first 10 rows (tags) account for.
df = articles_by_tag.set_index('tag')[['count', 'comments', 'reactions']]
top10_share = df.head(10).sum() / df.sum() * 100
top10_share.to_frame(name='%')


def to_series(sort_column, val_func):
    """Aggregate the articles by their single best-ranked tag.

    Each article is attributed to exactly one tag: the one among its own tags
    that ranks highest in ``articles_by_tag`` on ``sort_column``. Articles
    with no tags are grouped under the empty string.

    Args:
        sort_column: column of ``articles_by_tag`` used to rank the tags; it
            is also the name given to the returned series.
        val_func: callable receiving the ``groupby('top_tag')`` object and
            returning one value per group as a Series.

    Returns:
        The Series produced by ``val_func``, renamed to ``sort_column`` and
        sorted in descending order.
    """
    tags_order = articles_by_tag.groupby('tag').agg({sort_column: 'sum'}).reset_index()\
        .sort_values(sort_column, ascending=False)\
        .tag

    # Position of every tag in the ranking (0 = best). A dict lookup per tag
    # replaces a full isin() scan of the ranking for every single article.
    rank = {tag: position for position, tag in enumerate(tags_order)}

    df = articles.copy()
    df['top_tag'] = df.tags.apply(
        lambda tag_list: min(tag_list, key=rank.__getitem__) if tag_list else '')

    groups = df.groupby('top_tag')
    return val_func(groups).rename(sort_column).sort_values(ascending=False)

# Per-top-tag totals, with each article attributed to a single tag.
s_counts = to_series('count', lambda groups: groups['id'].count())
s_reactions = to_series('reactions', lambda groups: groups['reactions'].sum())
s_comments = to_series('comments', lambda groups: groups['comments'].sum())


# One line chart per metric, showing its 80 leading tags.
for sr in (s_counts, s_reactions, s_comments):
    fig = px.line(sr.head(80))
    fig.update_traces(mode='lines+markers')
    fig.update_layout(
        title=sr.name,
        showlegend=False,
        xaxis={'title': None, 'tickmode': 'linear'},
        yaxis={'title': None},
        margin={'l': 0, 'b': 0},
    )
    fig.show()


# Cumulative share per metric: how much of the overall total is covered by
# the leading tags, for the first 80 tags.
for sr in (s_counts, s_reactions, s_comments):
    srtop = sr.head(80)
    running_total = srtop.cumsum()
    grand_total = sr.sum()

    fig = px.line(
        running_total / grand_total * 100,
        custom_data=[srtop, running_total.values],
    )

    # The hover shows the tag's own value, the running total and its share.
    hover_text = (
        'tag: %{x}<br>'
        + sr.name
        + ': %{customdata[0]:,}<br>cumsum: %{customdata[1]:,} → %{y:.2f}%'
    )
    fig.update_traces(dict(mode='lines+markers', hovertemplate=hover_text))
    fig.update_layout(
        title=f'{sr.name} (cumsum) - total: {grand_total:,}',
        showlegend=False,
        xaxis={'title': None, 'tickmode': 'linear'},
        yaxis={'title': None},
        margin={'l': 0, 'b': 0},
    )
    fig.show()


def tags_accounting_for(percent, serie):
    """List the leading entries whose cumulative share stays within ``percent``.

    The cumulative sum of ``serie`` is converted to a percentage of its
    total. Every entry whose cumulative percentage is at most ``percent`` is
    rendered as ``label (xx.xx%)`` and the pieces are joined with ", ".
    """
    cumulative_share = (serie.cumsum() / serie.sum()) * 100
    pieces = []
    for label, share in cumulative_share.items():
        if share <= percent:
            pieces.append(f'{label} ({share:.2f}%)')
    return ', '.join(pieces)


# Share of each metric's total that the listed tags must stay within.
percent = 80

# The header is derived from `percent` so it cannot drift from the threshold
# actually passed to tags_accounting_for().
print(f'Cumsum of the top tags accounting for {percent}% of the count/reactions/comments.\n')
print('Count: \n', tags_accounting_for(percent, s_counts))
print('\nReactions: \n', tags_accounting_for(percent, s_reactions))
print('\nComments: \n', tags_accounting_for(percent, s_comments))

Cumsum of the top tags accounting for 80% of the count/reactions/comments.

Count: 
 webdev (45.15%), javascript (62.63%), beginners (72.87%), productivity (77.15%)

Reactions: 
 webdev (49.60%), javascript (66.47%), beginners (76.68%)

Comments: 
 webdev (39.45%), javascript (53.25%), welcome (62.03%), beginners (70.71%), career (75.86%), productivity (79.62%)


# Side-by-side view of the 10 leading tags for each metric.
top10 = {'count': s_counts, 'reactions': s_reactions, 'comments': s_comments}
pd.DataFrame({name: series.head(10).index for name, series in top10.items()})


from wordcloud import WordCloud
import matplotlib.pyplot as plt

def gen_tagcloud(generate_func):
    """Build a word cloud and display it with matplotlib.

    ``generate_func`` is called with a new 1200x420 ``WordCloud`` (white
    background, collocations disabled) and is responsible for filling it.
    The cloud is then shown in a figure sized to the cloud's pixel
    dimensions, with the axes hidden.
    """
    cloud = WordCloud(
        width=1200,
        height=420,
        background_color="white",
        collocations=False,
    )
    generate_func(cloud)

    # cloud.to_file("tagcloud.png") would also save the image to disk.

    # matplotlib sizes figures in inches, so convert pixels using the
    # configured DPI.
    inches_per_pixel = 1 / plt.rcParams['figure.dpi']
    figure_size = (cloud.width * inches_per_pixel, cloud.height * inches_per_pixel)
    plt.figure(figsize=figure_size)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# Tag cloud weighted by each tag's article count. Pairing the two columns
# directly avoids a row-by-row apply() just to build the mapping.
freqs = dict(zip(tags['tag'], tags['count']))
gen_tagcloud(lambda tc: tc.generate_from_frequencies(freqs))


# Tag cloud built from the tags of the top articles: every article's tags
# are joined with spaces, then all articles are joined into one text.
text = ' '.join(map(' '.join, articles.tags))
gen_tagcloud(lambda tc: tc.generate(text))

	id	date	reactions	comments	readtime
count	1.000000e+04	1.000000e+04	10000.000000	10000.00000	10000.000000
mean	4.515240e+05	1.593509e+09	328.960900	22.65850	5.981400
std	3.367066e+05	3.701980e+07	446.039356	45.30416	6.785548
min	3.560000e+02	1.461888e+09	122.000000	-25.00000	0.000000
25%	1.626262e+05	1.567225e+09	151.000000	6.00000	3.000000
50%	3.603605e+05	1.592997e+09	204.000000	13.00000	5.000000
75%	7.354550e+05	1.624631e+09	334.000000	25.00000	7.000000
max	1.250952e+06	1.668080e+09	12687.000000	1190.00000	195.000000

	tag	count	reactions	comments
0	webdev	4515	1631495	89391
1	javascript	4385	1511631	80178
2	beginners	3074	1117579	56631
3	react	1784	600528	30396
4	tutorial	1298	408933	19005

	count	reactions	comments
count	1676.000000	1.676000e+03	1676.000000
mean	21.175418	7.024169e+03	439.421241
std	191.541032	6.776828e+04	3718.690730
min	1.000000	1.220000e+02	0.000000
25%	1.000000	1.750000e+02	12.000000
50%	1.000000	3.350000e+02	29.000000
75%	4.000000	1.012250e+03	88.000000
max	4515.000000	1.631495e+06	89391.000000

Inspecting dev.to tags and popular articles¶

⮕⮕⮕ READ THE ARTICLE ON DEV.TO: dev.to is for webdevs and beginners - I have data to prove it¶

Foreword¶

Getting the data¶

Top tags¶

Number of articles per tag¶

Top articles¶

Code¶

Imports¶

Tags¶

Top Articles¶

Most popular tags in the top 10,000 articles¶

Most successful tags - counting articles only once¶

Tag clouds¶

Conclusion¶

tag	count	reactions	comments	∂r	∂c
javascript	1	2	2	-1	-1
webdev	2	1	1	1	1
beginners	3	3	5	0	-2
tutorial	4	9	13	-5	-9
react	5	5	7	0	-2
programming	6	10	15	-4	-9
python	7	15	28	-8	-21
discuss	8	32	3	-24	5
productivity	9	6	8	3	1
css	10	7	10	3	0
career	11	4	6	7	5
node	12	11	17	1	-5
devops	13	16	38	-3	-25
codenewbie	14	13	18	1	-4
html	15	8	11	7	4
opensource	16	17	9	-1	7
typescript	17	20	23	-3	-6
aws	18	35	57	-17	-39
showdev	19	21	14	-2	5
github	20	12	21	8	-1
java	21	31	39	-10	-18
testing	22	46	66	-24	-44
docker	23	22	46	1	-23
php	24	55	35	-31	-11
security	25	25	32	0	-7
linux	26	29	19	-3	7
vue	27	24	24	3	3
ruby	28	65	50	-37	-22
git	29	14	20	15	9
angular	30	28	40	2	-10
go	31	47	53	-16	-22
database	32	52	68	-20	-36
dotnet	33	63	65	-30	-32
csharp	34	72	63	-38	-29
serverless	35	58	64	-23	-29
machinelearning	36	60	92	-24	-56
kubernetes	37	43	91	-6	-54
rails	38	70	52	-32	-14
computerscience	39	18	30	21	9
cloud	40	54	84	-14	-44

	%
count	57.207664
comments	52.693117
reactions	60.887442