MIKE LEVIN AI SEO

Future-proof your skills with Linux, Python, vim & git as I share with you the most timeless and love-worthy tools in tech through my two great projects that work great together.

Fixing How OpenAI Couldn't Format Keywords Consistently

I'm working to fix OpenAI keyword problems by writing code to recognize text that doesn't match the keyword format, handling commas inside quotes, and creating histograms. I've used this code to re-write the database of keywords for my blog, validate YAML in Python without Jekyll, and create category pages and navigation. I'm committed to developing code that can consistently format keywords.

Creating Consistent Keyword Formatting with OpenAI Code

By Michael Levin

Monday, April 17, 2023

I’m going to fix the OpenAI keyword problems. This gets us started by recognizing things that don’t match the “keyword one”, “keyword two” format. First, I catch singletons.

import re

from sqlitedict import SqliteDict as sqldict


# Two or more double-quoted keywords separated by commas,
# e.g. "keyword one", "keyword two". Requires `re` to be imported.
inside_comma = r'^("[^"]+",\s*)+"[^"]+"$'

with sqldict("topics.db") as db:
    for key, value in db.iteritems():
        # nv stays None unless the value is recognised as a singleton;
        # well-formed multi-keyword lists are deliberately not reported here.
        nv = None
        if not re.match(inside_comma, value):
            # Singleton: exactly two quotes, and they wrap the whole value.
            if value.count('"') == 2 and value[0] == '"' and value[-1] == '"':
                nv = value
        print(nv)

Fast-forward to handling the most common case, the comma inside the quotes:

"keyword one," "keyword two," "keyword three"

The problems are like this one and many variations. I’ve already bored you. Let’s get the code done!


```python
from collections import Counter
from sqlitedict import SqliteDict as sqldict


def good_pat(text):
    """Return True when *text* is a well-formed multi-keyword list.

    Well-formed means two or more double-quoted keywords separated by
    commas (optionally followed by whitespace), for example
    ``"keyword one", "keyword two"``. A single quoted keyword does not
    qualify.
    """
    inside_comma = r'^("[^"]+",\s*)+"[^"]+"$'
    return re.match(inside_comma, text) is not None

def get_max_count_key(data):
    """Return the most frequent element of *data*.

    *data* may be any iterable of hashable items (or an existing
    ``Counter``). When several elements share the highest count, the one
    seen first wins. Returns ``None`` for empty input instead of raising.
    """
    counts = Counter(data)
    # max() yields the first maximal key in insertion order, and
    # default=None covers the empty case that previously raised ValueError.
    return max(counts, key=counts.get, default=None)


with sqldict("topics.db") as db:
    for i, (key, value) in enumerate(db.iteritems()):
        nv = None
        if good_pat(value):
            nv = value
        else:
            # Look at the second-to-last character of every space-separated
            # token. When that character is most often a comma, as in
            # `"python," "linux," "vim"`, the value is treated as the
            # comma-inside-the-quotes form.
            # NOTE(review): multi-word keywords add tokens such as `"keyword`
            # that can outvote the comma -- confirm against real data.
            tokens = [x for x in value.split(" ") if len(x) > 1]
            comma_test = [x[-2] for x in tokens]
            # A value with no usable tokens has no histogram to consult.
            max_end = get_max_count_key(comma_test) if comma_test else None

            if value.count('"') == 2 and value[0] == '"' and value[-1] == '"':
                # Simple case of the singleton
                nv = value
            elif max_end == ",":
                # Drop the outermost quotes, then cut at each `," "` seam.
                alist = value[1:-1].split('," "')
                last_item = alist[-1]
                if last_item.endswith("."):
                    # Strip the trailing period but keep the keyword itself.
                    alist[-1] = last_item[:-1]
                alist = [f'"{x}"' for x in alist if x]
                astring = ", ".join(alist)
                if good_pat(astring):
                    nv = astring
        print(i, nv)
        print()
        if not nv:
            # Stop at the first value that could not be repaired so it can
            # be inspected.
            print(value)
            raise SystemExit()

```

Ugh! That got out of hand. I’m taking a new approach:

import re
from collections import Counter
from sqlitedict import SqliteDict as sqldict


def check_pat(text):
    """Return *text* if it is a well-formed keyword string, else None.

    Accepted forms are a comma-separated list of double-quoted keywords
    (``"one", "two"``) or a single double-quoted keyword that contains no
    comma (``"one"``).
    """
    inside_comma = r'(^("[^"]+",\s*)+"[^"]+"$)|(^"[^",]*"$)'
    return text if re.match(inside_comma, text) else None


with sqldict("topics.db") as db:
    for i, (key, value) in enumerate(db.iteritems()):
        if check_pat(value):
            # Already well formed; nothing to repair.
            continue
        # Throw away every quote, split on ", " and quote each piece again.
        pieces = value.replace('"', '').split(", ")
        requoted = ", ".join([f'"{piece}"' for piece in pieces])
        if check_pat(requoted):
            continue
        # Still malformed: show the offending row and stop.
        print(i, value)
        raise SystemExit()

And I developed that one until it was nearly finished:

import re
from collections import Counter
from sqlitedict import SqliteDict as sqldict


def check_pat(text):
    """Return *text* when it is a single-line, well-formed keyword string.

    Well formed means either a comma-separated list of double-quoted
    keywords or one double-quoted keyword without a comma. Any text that
    contains a newline is rejected. Returns None otherwise.
    """
    inside_comma = r'(^("[^"]+",\s*)+"[^"]+"$)|(^"[^",]*"$)'

    if re.match(inside_comma, text, re.MULTILINE) is None or "\n" in text:
        return None
    return text


with sqldict("topics.db") as db:
    for i, (key, value) in enumerate(db.iteritems()):
        newlist = check_pat(value)
        if not newlist and value.count('"') > 3:
            # More than one quoted item: drop all quotes, then re-quote the
            # pieces found between ", " separators.
            stripped = value.replace('"', "")
            candidate = ", ".join([f'"{part}"' for part in stripped.split(", ")])
            newlist = check_pat(candidate)
            if not newlist and "\n" in value:
                # Second attempt: treat each line as one keyword.
                candidate = ", ".join(
                    [f'"{line}"' for line in stripped.split("\n")]
                )
                newlist = check_pat(candidate)
                if not newlist:
                    # Neither repair worked on a multi-line value: show it
                    # and stop.
                    print(i, candidate)
                    raise SystemExit()
        # Dry run only: report the result, nothing is written back.
        print(i, newlist)
        print()
print("Done")

And here’s the finished code that re-writes the database:

import re
from collections import Counter
from sqlitedict import SqliteDict as sqldict


def check_pat(text):
    """Validate the quote-and-comma keyword format.

    Returns *text* unchanged when it is a newline-free string made of
    either several double-quoted keywords separated by commas or exactly
    one double-quoted keyword with no comma in it; returns None for
    anything else.
    """
    inside_comma = r'(^("[^"]+",\s*)+"[^"]+"$)|(^"[^",]*"$)'
    well_formed = bool(re.match(inside_comma, text, re.MULTILINE))
    return text if well_formed and "\n" not in text else None


with sqldict("topics.db") as db:
    for i, (key, value) in enumerate(db.iteritems()):
        newlist = check_pat(value)
        if not newlist and value.count('"') > 3:
            # Repair 1: strip every quote, then re-quote the pieces found
            # between ", " separators (only tried when not a singleton).
            noquotes = value.replace('"', "")
            alist = ", ".join([f'"{x}"' for x in noquotes.split(", ")])
            newlist = check_pat(alist)
            if not newlist and "\n" in value:
                # Repair 2: the keywords came back one per line.
                alist = ", ".join([f'"{x}"' for x in noquotes.split("\n")])
                newlist = check_pat(alist)
            if not newlist:
                # Neither repair produced a well-formed list. Stop here
                # instead of falling through and storing None over the
                # original value.
                print(i, alist)
                raise SystemExit()
            # NOTE(review): eval() runs text that came from the database.
            # At this point it has passed check_pat, so it should consist
            # only of double-quoted strings, but ast.literal_eval would be
            # the safer way to confirm it parses as a list.
            real_list = eval(f"[{newlist}]")
            assert isinstance(real_list, list)
            db[key] = newlist
    db.commit()
print("Done")

And there were a few stragglers that had to be cleaned up:

with sqldict("topics.db") as db:
    for i, (key, value) in enumerate(db.iteritems()):
        if not value:
            # Empty or missing entries are left untouched.
            continue
        try:
            # A value is a straggler when it does not parse as the contents
            # of a Python list.
            # NOTE(review): eval() executes whatever text is stored in the
            # database; ast.literal_eval would be a safer parser if this
            # data can come from an untrusted source.
            eval(f"[{value}]")
        except Exception:
            # Any parse or evaluation error marks the value for repair, while
            # KeyboardInterrupt and SystemExit still propagate.
            proc = True
        else:
            proc = False
        if proc:
            # Strip all quotes and re-quote each ", "-separated piece.
            alist = value.split(", ")
            alist = [x.replace('"', "") for x in alist]
            alist = [f'"{x}"' for x in alist]
            alist = ", ".join(alist)
            print(i, alist)
            db[key] = alist
    db.commit()

And now I can do a histogram of the keywords on my blog:

# Collect every non-empty keyword, lower-cased, across all stored values.
keywords = []
with sqldict("topics.db") as db:
    for _key, value in db.iteritems():
        # Each value is parsed as the contents of a Python list.
        parsed = eval(f"[{value}]")
        keywords.extend([kw.lower() for kw in parsed if kw])
# (keyword, count) pairs, most frequent first.
kwhist = Counter(keywords).most_common()
# Bare expression: shows keywords used more than five times when run in a
# notebook or REPL.
[(kw, count) for kw, count in kwhist if count > 5]

And here are the most popular keywords on my site. I’ll have to make category pages and navigation out of these.

[('python', 89),
 ('linux', 70),
 ('vim', 50),
 ('seo', 41),
 ('windows', 28),
 ('ai', 26),
 ('lxd', 25),
 ('jekyll', 22),
 ('jupyterlab', 20),
 ('git', 20),
 ('technology', 20),
 ('microsoft', 17),
 ('github pages', 15),
 ('wsl2', 15),
 ('wsl', 15),
 ('bing', 15),
 ('jupyter notebooks', 13),
 ('nbdev', 12),
 ('artificial intelligence', 12),
 ('youtube', 11),
 ('github', 11),
 ('jupyter notebook', 11),
 ('automation', 11),
 ('journaling', 11),
 ('data', 11),
 ('writing', 10),
 ('web development', 10),
 ('systemd', 10),
 ('jupyter', 9),
 ('unix', 9),
 ('windows 10', 9),
 ('windows subsystem for linux', 9),
 ('foss', 9),
 ('chatgpt', 9),
 ('website', 8),
 ('code', 8),
 ('productivity', 8),
 ('script', 8),
 ('blog', 8),
 ('markdown', 8),
 ('ubuntu', 8),
 ('api', 8),
 ('machine learning', 8),
 ('vscode', 8),
 ('NeoVim', 8),
 ('blog post', 7),
 ('google', 7),
 ('video', 7),
 ('pipulate', 7),
 ('coding', 7),
 ('html', 6),
 ('github.io', 6),
 ('google analytics', 6),
 ('documentation', 6),
 ('configuration', 6),
 ('ubuntu 20.04', 6),
 ('ubuntu 18.04', 6),
 ('challenges', 6),
 ('quantum mechanics', 6),
 ('open source', 6),
 ('google photos', 6),
 ('ufos', 6),
 ('openai', 6),
 ('mac', 6)]

And one final newsflash here. I had to do one more update of the keyword database:

keywords = []
with sqldict("topics.db") as db:
    for i, (key, value) in enumerate(db.iteritems()):
        if not value:
            # Nothing stored for this key; leave it alone.
            continue
        # Store the keywords as a plain comma-separated string with no
        # double quotes at all.
        unquoted = value.replace('"', "")
        print(i, unquoted)
        db[key] = unquoted
    db.commit()

…because I kept getting YAML parsing errors when I tried to insert them into Jekyll front matter. It’s related to the quotes. I wrote this program so I can validate YAML in Python without Jekyll:

import os
import yaml

# Directory holding the Jekyll posts to validate.
folder_path = "/home/ubuntu/repos/hide/MikeLev.in/_posts"

for filename in os.listdir(folder_path):
    if not filename.endswith(".md"):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as file:
        post_content = file.read()

    # Front matter is the text between the first two "---" markers. A post
    # with fewer than two markers has none, so report it instead of letting
    # the unpacking fail with ValueError.
    parts = post_content.split("---", maxsplit=2)
    if len(parts) < 3:
        print(f"{file_path}: No YAML front matter found.")
        continue
    yaml_front_matter = parts[1]

    try:
        yaml.safe_load(yaml_front_matter)
    except yaml.YAMLError as e:
        print(f"{file_path}: Error in YAML front matter:", e)
    else:
        print(f"{file_path}: YAML front matter is valid.")

Categories