{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get likes from Instagram-Posts"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pickle\n",
"import pandas as pd\n",
"from IPython.display import IFrame, HTML\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from pathlib import Path\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Run Insta-Overview Notebook first !!!\n",
"# (presumably that notebook creates the pickle file loaded below - TODO confirm)\n",
"# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.\n",
"with open(\"Insta-Overview/instagram-post-embed-links-list.pickle\", \"rb\") as links_file:\n",
"    insta_post_embed_links = pickle.load(links_file)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-06-02T11:58:12.446590\n"
]
}
],
"source": [
"# Timestamp of the last run of this notebook\n",
"last_run = datetime.now()\n",
"print(last_run.isoformat())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Instagram Posts"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading Posts from Cache\n"
]
}
],
"source": [
"# Cache of the raw HTML of every embed page, so re-runs do not download the posts again.\n",
"insta_posts_html_path = Path(\"get-likes-for-instagram-posts/insta_posts_html_list.pickle\")\n",
"if insta_posts_html_path.exists():\n",
"    print(\"Loading Posts from Cache\")\n",
"    # NOTE: pickle.load can execute arbitrary code - only load cache files written by this notebook.\n",
"    with open(insta_posts_html_path, \"rb\") as cache_file:\n",
"        insta_posts_html = pickle.load(cache_file)\n",
"else:\n",
"    insta_posts_html = []\n",
"    print(\"Downloading Posts:\")\n",
"    for i, link in enumerate(insta_post_embed_links, start=1):\n",
"        print(f\"{i},\", end='')\n",
"        # Timeout so a stalled connection cannot hang the notebook forever\n",
"        r = requests.get(link, timeout=30)\n",
"        if r.status_code == 200:\n",
"            insta_posts_html.append(r.text)\n",
"        else:\n",
"            # f-string, so the actual status code ends up in the message\n",
"            raise Exception(f\"HTTP no success, Statuscode ({r.status_code})\")\n",
"\n",
"    print(\"\\nDownloaded all Posts.\")\n",
"\n",
"    # Make sure the cache directory exists; otherwise the finished download would be lost.\n",
"    insta_posts_html_path.parent.mkdir(parents=True, exist_ok=True)\n",
"    with open(insta_posts_html_path, \"wb\") as cache_file:\n",
"        pickle.dump(insta_posts_html, cache_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extract Likes and create DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" likes | \n",
" links | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 22 | \n",
" https://www.instagram.com/p/B_X0N2ZANYw/embed/ | \n",
"
\n",
" \n",
" 1 | \n",
" 15 | \n",
" https://www.instagram.com/p/B_X2GNege1r/embed/ | \n",
"
\n",
" \n",
" 2 | \n",
" 15 | \n",
" https://www.instagram.com/p/B_X3O7KlHnk/embed/ | \n",
"
\n",
" \n",
" 3 | \n",
" 11 | \n",
" https://www.instagram.com/p/B_aQan1FQku/embed/ | \n",
"
\n",
" \n",
" 4 | \n",
" 8 | \n",
" https://www.instagram.com/p/B_aNHkwlTTe/embed/ | \n",
"
\n",
" \n",
" 5 | \n",
" 10 | \n",
" https://www.instagram.com/p/B_aJo5OFQpw/embed/ | \n",
"
\n",
" \n",
" 6 | \n",
" 8 | \n",
" https://www.instagram.com/p/B_XlLUMpJzV/embed/ | \n",
"
\n",
" \n",
" 7 | \n",
" 9 | \n",
" https://www.instagram.com/p/B_XlN12plBk/embed/ | \n",
"
\n",
" \n",
" 8 | \n",
" 34 | \n",
" https://www.instagram.com/p/B_W-oPwIu8w/embed/ | \n",
"
\n",
" \n",
" 9 | \n",
" 33 | \n",
" https://www.instagram.com/p/B_W-nLrITcK/embed/ | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" likes links\n",
"0 22 https://www.instagram.com/p/B_X0N2ZANYw/embed/\n",
"1 15 https://www.instagram.com/p/B_X2GNege1r/embed/\n",
"2 15 https://www.instagram.com/p/B_X3O7KlHnk/embed/\n",
"3 11 https://www.instagram.com/p/B_aQan1FQku/embed/\n",
"4 8 https://www.instagram.com/p/B_aNHkwlTTe/embed/\n",
"5 10 https://www.instagram.com/p/B_aJo5OFQpw/embed/\n",
"6 8 https://www.instagram.com/p/B_XlLUMpJzV/embed/\n",
"7 9 https://www.instagram.com/p/B_XlN12plBk/embed/\n",
"8 34 https://www.instagram.com/p/B_W-oPwIu8w/embed/\n",
"9 33 https://www.instagram.com/p/B_W-nLrITcK/embed/"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The cached HTML and the link list come from two separate pickles; zip() would\n",
"# silently truncate on a mismatch and pair likes with the wrong links.\n",
"if len(insta_posts_html) != len(insta_post_embed_links):\n",
"    raise ValueError(\n",
"        f\"{len(insta_posts_html)} cached posts but {len(insta_post_embed_links)} links - \"\n",
"        \"delete the cache file and re-run the download cell\"\n",
"    )\n",
"\n",
"likes = []\n",
"\n",
"for insta_post in insta_posts_html:\n",
"    soup = BeautifulSoup(insta_post, 'html.parser')\n",
"    like_string = soup.select(\"div.SocialProof\")[0]\n",
"    # Parsing for one-time-usage: the like count is taken from the 11th space-separated\n",
"    # token of the element text; commas (presumably thousands separators) are removed.\n",
"    like_token = like_string.get_text().split(\" \")[10]\n",
"    likes.append(int(like_token.replace(\",\", \"\")))\n",
"\n",
"df = pd.DataFrame(zip(likes, insta_post_embed_links), columns=['likes', 'links'])\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Top-n Posts by likes"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" likes | \n",
" links | \n",
"
\n",
" \n",
" \n",
" \n",
" 46 | \n",
" 37 | \n",
" https://www.instagram.com/p/B_znqf9J2kV/embed/ | \n",
"
\n",
" \n",
" 47 | \n",
" 36 | \n",
" https://www.instagram.com/p/B_zoVx4JGxA/embed/ | \n",
"
\n",
" \n",
" 48 | \n",
" 36 | \n",
" https://www.instagram.com/p/B_zwQVLplbH/embed/ | \n",
"
\n",
" \n",
" 8 | \n",
" 34 | \n",
" https://www.instagram.com/p/B_W-oPwIu8w/embed/ | \n",
"
\n",
" \n",
" 9 | \n",
" 33 | \n",
" https://www.instagram.com/p/B_W-nLrITcK/embed/ | \n",
"
\n",
" \n",
" 10 | \n",
" 33 | \n",
" https://www.instagram.com/p/B_W-l1oIpFA/embed/ | \n",
"
\n",
" \n",
" 111 | \n",
" 33 | \n",
" https://www.instagram.com/p/CAdroyeqY__/embed/ | \n",
"
\n",
" \n",
" 0 | \n",
" 22 | \n",
" https://www.instagram.com/p/B_X0N2ZANYw/embed/ | \n",
"
\n",
" \n",
" 109 | \n",
" 21 | \n",
" https://www.instagram.com/p/CAdoHDHK2Ak/embed/ | \n",
"
\n",
" \n",
" 18 | \n",
" 20 | \n",
" https://www.instagram.com/p/B_e75mmhffZ/embed/ | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" likes links\n",
"46 37 https://www.instagram.com/p/B_znqf9J2kV/embed/\n",
"47 36 https://www.instagram.com/p/B_zoVx4JGxA/embed/\n",
"48 36 https://www.instagram.com/p/B_zwQVLplbH/embed/\n",
"8 34 https://www.instagram.com/p/B_W-oPwIu8w/embed/\n",
"9 33 https://www.instagram.com/p/B_W-nLrITcK/embed/\n",
"10 33 https://www.instagram.com/p/B_W-l1oIpFA/embed/\n",
"111 33 https://www.instagram.com/p/CAdroyeqY__/embed/\n",
"0 22 https://www.instagram.com/p/B_X0N2ZANYw/embed/\n",
"109 21 https://www.instagram.com/p/CAdoHDHK2Ak/embed/\n",
"18 20 https://www.instagram.com/p/B_e75mmhffZ/embed/"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Order the posts by like count, highest first (the original row index is kept).\n",
"df = df.sort_values('likes', ascending=False)\n",
"df.head(n=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Overview of Like data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" likes | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 122.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 11.672131 | \n",
"
\n",
" \n",
" std | \n",
" 6.751116 | \n",
"
\n",
" \n",
" min | \n",
" 2.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 8.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 10.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 13.000000 | \n",
"
\n",
" \n",
" max | \n",
" 37.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" likes\n",
"count 122.000000\n",
"mean 11.672131\n",
"std 6.751116\n",
"min 2.000000\n",
"25% 8.000000\n",
"50% 10.000000\n",
"75% 13.000000\n",
"max 37.000000"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Post with Lowest and Highest Number of Likes"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" Lowest number of likes
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Highest number of likes
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Look the extremes up directly instead of sorting the whole frame twice.\n",
"lowest_link = df.loc[df['likes'].idxmin(), 'links']\n",
"highest_link = df.loc[df['likes'].idxmax(), 'links']\n",
"\n",
"display(HTML(\"<h3>Lowest number of likes</h3>\"))\n",
"display(IFrame(lowest_link, 400, 800))\n",
"\n",
"display(HTML(\"<h3>Highest number of likes</h3>\"))\n",
"display(IFrame(highest_link, 400, 800))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot likes over Posts (over Time) with Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# x is the row index, i.e. the position of each post in the list of embed links.\n",
"fig, ax = plt.subplots()\n",
"sns.regplot(x=df.index, y=df['likes'], ax=ax)\n",
"# Trailing semicolon suppresses the repr of the returned list of artists.\n",
"ax.set(title='Likes per post with linear regression', xlabel='Post index', ylabel='Likes');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Top-10 Instagram Posts by Likes (embedded)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_znqf9J2kV"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_zoVx4JGxA"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_zwQVLplbH"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_W-oPwIu8w"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_W-nLrITcK"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_W-l1oIpFA"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/CAdroyeqY__"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_X0N2ZANYw"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/CAdoHDHK2Ak"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"https://www.instagram.com/p/B_e75mmhffZ"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# df was sorted by likes (descending) in the top-n cell, so the first rows are the top posts.\n",
"for embed_link in df['links'].head(10):\n",
"    # Drop the last two path segments (\"embed\" and the empty one after the trailing slash)\n",
"    # to get the regular post URL.\n",
"    post_url = embed_link.rsplit(\"/\", 2)[0]\n",
"    # Render the URL as a clickable link above the embedded post\n",
"    display(HTML(f\"<a href='{post_url}'>{post_url}</a>\"))\n",
"    display(IFrame(embed_link, 400, 800))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}