jadb/migrations/0001_JMDict.sql

SELECT icu_load_collation('ja_JP', 'japanese');

CREATE TABLE "JMdict_Version" (
  "version" VARCHAR(10) PRIMARY KEY NOT NULL,
  "date" DATE NOT NULL,
  "hash" VARCHAR(64) NOT NULL
) WITHOUT ROWID;

CREATE TRIGGER "JMdict_Version_SingleRow"
BEFORE INSERT ON "JMdict_Version"
WHEN (SELECT COUNT(*) FROM "JMdict_Version") >= 1
BEGIN
    SELECT RAISE(FAIL, 'Only one row allowed in JMdict_Version');
END;

CREATE TABLE "JMdict_InfoDialect" (
  "id" VARCHAR(4) PRIMARY KEY NOT NULL,
  "description" TEXT NOT NULL
) WITHOUT ROWID;

CREATE TABLE "JMdict_InfoField" (
  "id" VARCHAR(7) PRIMARY KEY NOT NULL,
  "description" TEXT NOT NULL
) WITHOUT ROWID;

CREATE TABLE "JMdict_InfoKanji" (
  "id" VARCHAR(5) PRIMARY KEY NOT NULL,
  "description" TEXT NOT NULL
) WITHOUT ROWID;

CREATE TABLE "JMdict_InfoMisc" (
  "id" VARCHAR(12) PRIMARY KEY NOT NULL,
  "description" TEXT NOT NULL
) WITHOUT ROWID;

CREATE TABLE "JMdict_InfoPOS" (
  "id" VARCHAR(9) PRIMARY KEY NOT NULL,
  "description" TEXT NOT NULL
) WITHOUT ROWID;

CREATE TABLE "JMdict_InfoReading" (
  "id" VARCHAR(5) PRIMARY KEY NOT NULL,
  "description" TEXT NOT NULL
) WITHOUT ROWID;

-- The XML specification says that an entry needs to have at least
-- one sense and one reading. I will just assume this is the case, and
-- not implement a check for it.

CREATE TABLE "JMdict_Entry" (
  "entryId" INTEGER PRIMARY KEY
);

-- KanjiElement

CREATE TABLE "JMdict_KanjiElement" (
  "elementId" INTEGER PRIMARY KEY,
  "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
  "orderNum" INTEGER NOT NULL,
  "reading" TEXT NOT NULL COLLATE japanese,
  "news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
  "ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
  "spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2),
  "gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2),
  "nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48),
  -- UNIQUE("entryId", "reading"),
  UNIQUE("entryId", "orderNum")
) WITHOUT ROWID;

CREATE INDEX "JMdict_KanjiElement_byEntryId_byOrderNum" ON "JMdict_KanjiElement"("entryId", "orderNum");
CREATE INDEX "JMdict_KanjiElement_byReading" ON "JMdict_KanjiElement"("reading");

CREATE TABLE "JMdict_KanjiElementInfo" (
  "elementId" INTEGER NOT NULL REFERENCES "JMdict_KanjiElement"("elementId"),
  "info" TEXT NOT NULL REFERENCES "JMdict_InfoKanji"("id"),
  PRIMARY KEY ("elementId", "info")
) WITHOUT ROWID;

-- ReadingElement

CREATE TABLE "JMdict_ReadingElement" (
  "elementId" INTEGER PRIMARY KEY,
  "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
  "orderNum" INTEGER NOT NULL,
  "reading" TEXT NOT NULL COLLATE japanese,
  "readingDoesNotMatchKanji" BOOLEAN NOT NULL DEFAULT FALSE,
  "news" INTEGER CHECK ("news" BETWEEN 1 AND 2),
  "ichi" INTEGER CHECK ("ichi" BETWEEN 1 AND 2),
  "spec" INTEGER CHECK ("spec" BETWEEN 1 AND 2),
  "gai" INTEGER CHECK ("gai" BETWEEN 1 AND 2),
  "nf" INTEGER CHECK ("nf" BETWEEN 1 AND 48),
  -- UNIQUE("entryId", "reading"),
  UNIQUE("entryId", "orderNum")
) WITHOUT ROWID;

CREATE INDEX "JMdict_ReadingElement_byEntryId_byOrderNum" ON "JMdict_ReadingElement"("entryId", "orderNum");
CREATE INDEX "JMdict_ReadingElement_byReading" ON "JMdict_ReadingElement"("reading");

CREATE TABLE "JMdict_ReadingElementRestriction" (
  "elementId" INTEGER NOT NULL REFERENCES "JMdict_ReadingElement"("elementId"),
  "restriction" TEXT NOT NULL,
  PRIMARY KEY ("elementId", "restriction")
) WITHOUT ROWID;

CREATE TABLE "JMdict_ReadingElementInfo" (
  "elementId" INTEGER NOT NULL REFERENCES "JMdict_ReadingElement"("elementId"),
  "info" TEXT NOT NULL REFERENCES "JMdict_InfoReading"("id"),
  PRIMARY KEY ("elementId", "info")
) WITHOUT ROWID;

-- Sense

CREATE TABLE "JMdict_Sense" (
  "senseId" INTEGER PRIMARY KEY,
  "entryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
  "orderNum" INTEGER NOT NULL,
  UNIQUE("entryId", "orderNum")
);

CREATE INDEX "JMdict_Sense_byEntryId_byOrderNum" ON "JMdict_Sense"("entryId", "orderNum");

CREATE TABLE "JMdict_SenseRestrictedToKanji" (
  "entryId" INTEGER NOT NULL,
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "kanjiOrderNum" INTEGER NOT NULL CHECK ("kanjiOrderNum" > 0),
  FOREIGN KEY ("entryId", "kanjiOrderNum") REFERENCES "JMdict_KanjiElement"("entryId", "orderNum"),
  PRIMARY KEY ("entryId", "senseId", "kanjiOrderNum")
) WITHOUT ROWID;

CREATE TABLE "JMdict_SenseRestrictedToReading" (
  "entryId" INTEGER NOT NULL,
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "readingOrderNum" INTEGER NOT NULL CHECK ("readingOrderNum" > 0),
  FOREIGN KEY ("entryId", "readingOrderNum") REFERENCES "JMdict_ReadingElement"("entryId", "orderNum"),
  PRIMARY KEY ("entryId", "senseId", "readingOrderNum")
) WITHOUT ROWID;

-- In order to add xrefs, you will need to have added the entry to xref to.
-- These should be added in a second pass of the dictionary file.

-- In this version of JMdict, the xrefs can be ambiguous.
-- There has been rumours of a nonambiguous version possibly arriving in the future
-- (https://www.edrdg.org/jmdict_edict_list/2019/msg00360.html)
-- but for time being, this need to be modeled as a one to many relationship.

-- These two things also concern "SenseAntonym"

CREATE TABLE "JMdict_SenseSeeAlso" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "xrefEntryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
  -- Sometimes the cross reference is to a specific sense
  "xrefSenseOrderNum" INTEGER,
  -- For some entries, the cross reference is ambiguous. This means that while the ingestion
  -- has determined some xrefEntryId, it is not guaranteed to be the correct one.
  "ambiguous" BOOLEAN NOT NULL DEFAULT FALSE,
  FOREIGN KEY ("xrefEntryId", "xrefSenseOrderNum") REFERENCES "JMdict_Sense"("entryId", "orderNum"),
  UNIQUE("senseId", "xrefEntryId", "xrefSenseOrderNum")
);

CREATE TABLE "JMdict_SenseAntonym" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "xrefEntryId" INTEGER NOT NULL REFERENCES "JMdict_Entry"("entryId"),
  -- For some entries, the cross reference is ambiguous. This means that while the ingestion
  -- has determined some xrefEntryId, it is not guaranteed to be the correct one.
  "ambiguous" BOOLEAN NOT NULL DEFAULT FALSE,
  UNIQUE("senseId", "xrefEntryId")
);

-- These cross references are going to be mostly accessed from a sense
-- This will speed up the join.
CREATE INDEX "JMdict_SenseSeeAlso_bySenseId" ON "JMdict_SenseSeeAlso"("senseId");
CREATE INDEX "JMdict_SenseAntonym_bySenseId" ON "JMdict_SenseAntonym"("senseId");

CREATE TABLE "JMdict_SensePOS" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "pos" TEXT NOT NULL REFERENCES "JMdict_InfoPOS"("id"),
  PRIMARY KEY ("senseId", "pos")
) WITHOUT ROWID;

CREATE TABLE "JMdict_SenseField" (
  "senseId" INTEGER NOT NULL,
  "field" TEXT NOT NULL,
  FOREIGN KEY ("senseId") REFERENCES "JMdict_Sense"("senseId"),
  FOREIGN KEY ("field") REFERENCES "JMdict_InfoField"("id"),
  PRIMARY KEY ("senseId", "field")
) WITHOUT ROWID;

CREATE TABLE "JMdict_SenseMisc" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "misc" TEXT NOT NULL REFERENCES "JMdict_InfoMisc"("id"),
  PRIMARY KEY ("senseId", "misc")
) WITHOUT ROWID;

CREATE TABLE "JMdict_SenseLanguageSource" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "language" CHAR(3) NOT NULL DEFAULT "eng",
  "phrase" TEXT,
  "fullyDescribesSense" BOOLEAN NOT NULL DEFAULT TRUE,
  "constructedFromSmallerWords" BOOLEAN NOT NULL DEFAULT FALSE,
  UNIQUE("senseId", "language", "phrase")
);

CREATE TABLE "JMdict_SenseDialect" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "dialect" TEXT NOT NULL REFERENCES "JMdict_InfoDialect"("id"),
  PRIMARY KEY ("senseId", "dialect")
) WITHOUT ROWID;

-- In the documentation, it says that the glossary can contain
-- special prioritized entries, but I can't find a single one of those.
-- Neither can I find a glossary tag with g_gend data, so these parts
-- will be omitted.

CREATE TABLE "JMdict_SenseGlossary" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "phrase" TEXT NOT NULL,
  "language" CHAR(3) NOT NULL DEFAULT "eng",
  "type" TEXT,
  PRIMARY KEY ("senseId", "language", "phrase")
) WITHOUT ROWID;

CREATE INDEX "JMdict_SenseGlossary_byPhrase" ON JMdict_SenseGlossary("phrase");

CREATE TABLE "JMdict_SenseInfo" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "info" TEXT NOT NULL,
  PRIMARY KEY ("senseId", "info")
) WITHOUT ROWID;

-- There is not a single example sentence that doesn't come from
-- the Tanaka Corpus, so I will leave the type out for now.

CREATE TABLE "JMdict_ExampleSentence" (
  "senseId" INTEGER NOT NULL REFERENCES "JMdict_Sense"("senseId"),
  "word" TEXT NOT NULL,
  "source" TEXT NOT NULL,
  "sourceLanguage" CHAR(3) NOT NULL DEFAULT "eng",
  "japanese" TEXT NOT NULL
  -- "type" TEXT NOT NULL DEFAULT "tat",
);