Web Scraping Projects


Python Requests



About the project

I have worked on a few projects related to web scraping, and luckily many of the sites involved had open APIs, which made it easy to get the data I wanted quickly and efficiently. On the flip side, I have also done web scraping through a Selenium WebDriver to gather information and to fill out online forms.
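
As a quick illustration of the Selenium side, below is a minimal sketch of driving a browser to fill out a simple form. The URL and field names are hypothetical placeholders rather than from a real project.

from selenium import webdriver
from selenium.webdriver.common.by import By

# Minimal sketch: the URL and element names below are hypothetical placeholders
driver = webdriver.Chrome()
driver.get("https://example.com/contact")

# Fill in the form fields by name and submit
driver.find_element(By.NAME, "email").send_keys("user@example.com")
driver.find_element(By.NAME, "message").send_keys("Hello from Selenium!")
driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()

# Grab something from the resulting page to confirm the submission went through
print(driver.title)

driver.quit()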

 

Some technologies I have used for web scraping are:

  • Requests
  • Scrapy
  • Selenium
  • Beautiful Soup
  • Pandas

 

Most of my web scraping experience is with Python Requests, so I will share a couple of those projects below.

 

 

1. Review Scraping - whop.com

 

For this project, I wanted to pull the reviews off of whop.com. By monitoring the network traffic, I was able to gather all the necessary info to recreate the request in a Python script using Requests.

The site uses GraphQL for the POST request, and with a sample query from the network payload I was able to easily get what I wanted. Once I had the JSON response, I pulled out the fields I needed and sent the reviews as embeds to a Discord channel. 

 

import requests
import json
import csv
from datetime import datetime
import discord
from discord import Webhook
import asyncio
import aiohttp

async def main():
    url = 'https://whop.com/api/graphql/fetchMarketplacePageReviews/'

    variables = {
        "id": "pge_bSKWtJeggxG147",
        "after": "MA==",
        "stars": None
    }

    # Define headers with referer
    headers = {
        'Referer': 'https://whop.com/marketplace/notify/'
    }

    query_count = '''
        query fetchMarketplacePageReviews($id: ID!, $after: String, $stars: Int) {
            publicPage(id: $id) {
                ...PublicMarketplacePageReviewsData
            }
        }
        
        fragment PublicMarketplacePageReviewsData on PublicPage {
            reviewsAverage
            reviews(first: 0, after: $after, stars: $stars) {
                totalCount
            }
        }
    '''

    payload = {'query': query_count, 'variables': variables, 'operationName': 'fetchMarketplacePageReviews'}

    # Make the POST request with headers
    response_count = requests.post(url, json=payload, headers=headers)

    # Check if the request was successful (status code 200)
    if response_count.status_code == 200:
        # Parse the JSON response to get the total count of reviews
        total_count_data = response_count.json()
        total_count = total_count_data['data']['publicPage']['reviews']['totalCount']
        avg_review = total_count_data['data']['publicPage']['reviewsAverage']
    else:
        print('Failed to fetch total count of reviews with status code:', response_count.status_code)
        return

    # Override the fetched total so only 3 reviews are requested below
    total_count = 3

    # Define the GraphQL query and variables
    query = '''
        query fetchMarketplacePageReviews($id: ID!, $after: String, $stars: Int) {
            publicPage(id: $id) {
                ...PublicMarketplacePageReviewsData
            }
        }
        
        fragment PublicMarketplacePageReviewsData on PublicPage {
            reviews(first: '''+ str(total_count) +''', after: $after, stars: $stars) {
                nodes {
                    ...PublicMarketplacePageReview
                }
            }
        }
        
        fragment PublicMarketplacePageReview on Review {
            user {
                header
                profilePic32: profilePicSrcset(style: s32, allowAnimation: true) {
                    original
                    double
                    isVideo
                }
            }
            joinedAt
            createdAt
            stars
            description
        }
    '''

    payload = {'query': query, 'variables': variables, 'operationName': 'fetchMarketplacePageReviews'}

    # Make the POST request with headers
    response = requests.post(url, json=payload, headers=headers)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        reviews = data['data']['publicPage']['reviews']['nodes']

        WEBHOOK_URL = "https://discord.com/api/webhooks/1210103050518274058/ghh28fSBWpsEcz25oZzsWhdtdqNazYV7d4iyr8DVoEiFwPbfJBeQlnVkR0kxJqG16Jzc"

        async with aiohttp.ClientSession() as session:

            webhook = Webhook.from_url(WEBHOOK_URL, session=session)
        
            # Send each review to the Discord webhook as an embed (oldest first)
            for review in reversed(reviews):
                name = review['user']['header']
                created_at = datetime.utcfromtimestamp(review['createdAt']).strftime('%Y-%m-%d %H:%M:%S')
                joined = datetime.utcfromtimestamp(review['joinedAt']).strftime('%Y-%m-%d')
                rating = "⭐" * review['stars']
                rev = review['description']
                pfp = review['user']['profilePic32']['double']

                embedVar = discord.Embed(
                    title=f"{name} | Member Since {joined}", 
                    description=f"{rating}\n\n**__Review:__** \n{rev}\n\n", 
                    color=0xDB0B23
                )
                embedVar.set_footer(
                    text = f"{avg_review} out of 5⭐({total_count} reviews) | (Reviewed on {created_at})", 
                )
                embedVar.set_thumbnail(
                    url = pfp
                )

                await webhook.send(embed=embedVar)

    else:
        print('Request failed with status code:', response.status_code)

if __name__ == "__main__":
    asyncio.run(main())

 

You can see an example output review below.

 

 

 

2. Company Scraping - Patreon

 

For this project, I wanted to pull creator listings to get an idea of what services and types of content these creators make for their patrons. From there, the data could be separated (since Patreon doesn't categorize things too well) and analyzed. You can find the code for this below.

 

import requests
import json
from string import ascii_lowercase as alc
import csv
import time

def patreon(search):
    url = f"https://patreon.com/api/search?q={search}&page%5Bnumber%5D=1"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Accept': 'application/vnd.api+json',
        'Accept-Encoding': 'utf-8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Referer': 'https://www.patreon.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'X-Requested-With': 'XMLHttpRequest',
        'App-Platform': 'web',
        'App-Version': '2023.11.12.01',
    }

    csv_element = ""

    response = requests.get(url=url, headers=headers)

    output = json.loads(response.text)

    # First, gather data from page one, then use the total page count to fetch the rest.
    data_list = output['data']
    for element in data_list:
        csv_element = (csv_element + 
                        element['attributes']['creator_name'] + 
                        "," + element['attributes']['creation_name'] + 
                        "," + str(element['attributes']['post_statistics']['total']) + 
                        "," + str(element['attributes']['patron_count']) + 
                        "," + element['attributes']['url'] + "\n")

    page_total = output["meta"]["pages_total"]
    i = 2
    # now the other pages, if any.
    while i <= page_total:
        time.sleep(1)
        url = f"https://patreon.com/api/search?q={search}&page%5Bnumber%5D={i}"
        response = requests.get(url=url, headers=headers)
        output = json.loads(response.text)

        data_list = output['data']
        for element in data_list:
            csv_element = (csv_element + 
                            element['attributes']['creator_name'] + 
                            "," + element['attributes']['creation_name'] + 
                            "," + str(element['attributes']['post_statistics']['total']) + 
                            "," + str(element['attributes']['patron_count']) +
                            "," + element['attributes']['url'] + "\n")

        i = i + 1

    return csv_element

def main():
    mk = open("patreon_requests_data.csv", "w", encoding="utf-8")
    for i in alc:
        for j in alc:
            patreon_list = patreon(i + j)
            print("Search Complete for " + i + j + ", now adding to CSV.") 
            mk.write(patreon_list)
    
    mk.close()        

if __name__ == "__main__":
    main()
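
To give an idea of the "separate and analyze" step mentioned above, here is a minimal Pandas sketch that reads the CSV produced by the script. The column names are ones I am assigning here (the file is written without a header row), and the keyword list is just an example.

import pandas as pd

# Column names assigned here; the script above writes the CSV without a header row
cols = ["creator_name", "creation_name", "post_count", "patron_count", "url"]
df = pd.read_csv("patreon_requests_data.csv", names=cols)

# Drop duplicate creators picked up by overlapping two-letter searches
df = df.drop_duplicates(subset="url")

# Rough grouping by keyword in the creation description (example keywords)
keywords = ["podcast", "music", "art", "game", "video"]
for kw in keywords:
    subset = df[df["creation_name"].str.contains(kw, case=False, na=False)]
    print(f"{kw}: {len(subset)} creators, {subset['patron_count'].sum()} total patrons")

# Top creators overall by patron count
print(df.sort_values("patron_count", ascending=False).head(10))

One caveat with the quick comma-joined CSV above is that commas inside creator or creation names will break the column alignment, so for anything beyond a rough pass it would be worth switching to the csv module's writer, which handles quoting.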