A Script To Pull Keywords From MOZ Pro Keyword Research Ranking Keywords
This blog post provides a step-by-step guide to automating browser actions using Playwright Browser Automation and Jupyter Notebook to pull keywords from MOZ Pro's Keyword Research Ranking Keywords. Additionally, I explain how to set up the environment, execute the code, and download files from a website.
Automating Keyword Research With Playwright and Jupyter Notebook
By Michael Levin
Tuesday, April 18, 2023
It’s time to actually record actions in browser automation. My first step is to add the first bit of recorded actions at all. I’m not performing the actual form submit, but I’m going to record the actions that lead up to it.
Playwright Browser Automation in Jupyter Notebook: Recording With Codegen
Playwright Browser Automation: Stepping Through a List
And the final step. It’s not pretty, but it got the job done:
# | export
import nest_asyncio
nest_asyncio.apply()
import re
import asyncio
from pathlib import Path
from playwright.async_api import Playwright, async_playwright
PAUSE_TO_RECORD = False
DATA_DIR = "../data/"
load_from = DATA_DIR + "pull_keywords.txt"
slow_mo = 100
moz_creds = "/home/ubuntu/repos/moz/assets/mozcreds.txt"
chrome_exe = "/usr/bin/google-chrome"
downloads_path = "/home/ubuntu/Downloads"
user_data = "/home/ubuntu/.config/google-chrome/"
def in_notebook():
"""Return True if run from a Jupyter Notebook and False if not."""
try:
import IPython
if IPython.get_ipython().__class__.__name__ == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole
else:
return False # Other type (likely a script)
except NameError:
return False # Probably standard Python interpreter
if in_notebook():
keyword = "example.com" # or set to any default value that you prefer
headless = False
else:
import argparse
headless = True
parser = argparse.ArgumentParser(
description="Pull keywords from MOZ Pro given -s site."
)
parser.add_argument("-s", "--site", type=str, required=True, help="Value for site")
args = parser.parse_args()
keyword = args.keyword
with open(moz_creds) as fh:
UN, PW = [x.strip().split(" ")[1] for x in fh.readlines()]
async def main():
for attempt in range(10):
if len(list(Path("/home/ubuntu/repos/moz/downloads/").glob("*.csv"))) == len(list(Path("/home/ubuntu/repos/moz/downloads/").glob("*.csv"))):
print("Done")
raise SystemExit()
try:
async with async_playwright() as playwright:
context = await playwright.chromium.launch_persistent_context(
viewport={"width": 1600, "height": 900},
downloads_path=downloads_path,
executable_path=chrome_exe,
user_data_dir=user_data,
accept_downloads=True,
headless=headless,
channel="chrome",
slow_mo=slow_mo,
)
page = await context.new_page()
await page.goto("https://moz.com/")
try:
await page.get_by_role("link", name="Log in").click()
await page.locator("#email").click()
await page.locator("#email").fill(UN)
await page.locator("#email").press("Tab")
await page.locator("#password").fill(PW)
await page.locator("#password").press("Enter")
except:
...
# Codegen activated
if PAUSE_TO_RECORD:
await page.pause() # Edit this line in for codegen and out for automation.
# -- BEGIN CODEGEN LINES --
await page.get_by_title("Moz Pro").click()
await page.get_by_role("link", name="Moz Pro Home").click()
await page.get_by_role("link", name="Keyword Research").click()
with open(load_from) as fh:
for i, line in enumerate(fh.readlines()):
site = line.strip()
# Build a set of everything already downloaded
seen = set()
for file in Path("/home/ubuntu/repos/moz/downloads/").glob("*.csv"):
seen_site = file.name.split("_")[1]
seen.add(seen_site)
if site not in seen:
print(i, site)
await page.get_by_role("link", name="Ranking Keywords").click()
await asyncio.sleep(3)
await page.locator("form").filter(has_text="root domainUnited States - en-USanalyze").locator("span").first.click()
await page.get_by_role("listitem").filter(has_text="subdomain").click()
await page.get_by_placeholder("Enter a subdomain (ex: news.mydomain.com) to find keywords that rank").click()
await page.get_by_placeholder("Enter a subdomain (ex: news.mydomain.com) to find keywords that rank").fill(site)
await page.get_by_role("button", name="analyze").click()
async with page.expect_download(timeout=5000000) as download_info:
await page.get_by_role("button", name="Export CSV").click()
download = await download_info.value
download = download_info.value
download = await download
await download.save_as(
"/home/ubuntu/repos/moz/downloads/" + download.suggested_filename
)
# When done, close the browser.
await asyncio.sleep(10)
await context.close()
except:
await asyncio.sleep(10)
...
async def run_main():
await main()
if in_notebook():
try:
asyncio.get_running_loop()
asyncio.run(run_main())
except RuntimeError as e:
if "no running event loop" in str(e):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(run_main())
else:
asyncio.run(run_main())