今さら言語処理100本ノック #05 後半

途中でかなりめんどくさくなってしまって、関数名とか実行時間とか考えず適当になったのは内緒。

自作ライブラリのインストール

MeCab と CaboCha の解析結果をパースしてオブジェクトとして扱えるようにするライブラリを作ったのでそれを使って解いていく。

Github: https://github.com/tomowarkar/cucurbita

git clone https://github.com/tomowarkar/cucurbita.git
pip install ./cucurbita

45

import CaboCha
from cucurbita.cab import Sect

# Sample sentence analysed in the snippets for exercises 45 and 46 below.
text = "吾輩はここで始めて人間というものを見た"
c = CaboCha.Parser()
# Parse the sentence and render the result as lattice-format text.
cabocha_result = c.parse(text).toString(CaboCha.FORMAT_LATTICE)
# Wrap the raw CaboCha text in cucurbita's Sect; the snippets below read
# its `.chunks` attribute.
sect = Sect(cabocha_result)

def relations(sect):
    """Return one ``(pos, dst)`` pair per chunk of ``sect``.

    The pairs are read from each chunk's ``pos`` and ``dst`` attributes and
    are returned in the order of ``sect.chunks``.
    """
    edges = []
    for chunk in sect.chunks:
        edges.append((chunk.pos, chunk.dst))
    return edges

def nlp_45(sect):
    """Map each predicate verb to the particles of the chunks depending on it.

    For every dependency edge reported by ``relations(sect)`` whose
    destination is not -1, the destination chunk supplies the predicate (the
    base form of its leftmost verb) and the source chunk supplies the cases
    (the base forms of all of its particles). Edges whose destination has no
    verb, or whose source has no particle, are ignored.

    Returns:
        dict: verb base form -> list of particle base forms, accumulated in
        edge order (not sorted).
    """
    cases_by_verb = {}
    for src_index, dst_index in relations(sect):
        if dst_index == -1:
            continue
        head = sect.chunks[dst_index]
        dependent = sect.chunks[src_index]

        verb_bases = [m.base for m in head.morphs if m.pos == "動詞"]
        particle_bases = [m.base for m in dependent.morphs if m.pos == "助詞"]
        if not (any(verb_bases) and any(particle_bases)):
            continue

        # Leftmost verb of the head chunk is the predicate.
        cases_by_verb.setdefault(verb_bases[0], []).extend(particle_bases)
    return cases_by_verb

# Print one "verb<TAB>particles" line per predicate, particles sorted and
# separated by single spaces.
for verb, particles in nlp_45(sect).items():
    print(verb, " ".join(sorted(particles)), sep="\t")

見る	は を
始める	で

46

def nlp_46(sect):
    """Map each predicate verb to the chunks (arguments) that depend on it.

    Uses the same selection rule as ``nlp_45``: an edge from
    ``relations(sect)`` counts only when its destination is not -1, the
    destination chunk contains a verb and the source chunk contains a
    particle. Here the source chunk object itself is collected instead of its
    particles.

    Returns:
        dict: base form of the head chunk's leftmost verb -> list of source
        chunks, in edge order.
    """
    args_by_verb = {}
    for src_index, dst_index in relations(sect):
        if dst_index == -1:
            continue
        head = sect.chunks[dst_index]
        dependent = sect.chunks[src_index]

        verb_bases = [m.base for m in head.morphs if m.pos == "動詞"]
        particle_bases = [m.base for m in dependent.morphs if m.pos == "助詞"]
        if not (any(verb_bases) and any(particle_bases)):
            continue

        # Leftmost verb of the head chunk is the predicate.
        args_by_verb.setdefault(verb_bases[0], []).append(dependent)
    return args_by_verb

def _last_particle(chunk):
    """Return the base form of the last particle in ``chunk``."""
    return [m.base for m in chunk.morphs if m.pos == "助詞"][-1]


# Print "verb<TAB>cases<TAB>arguments"; arguments are ordered by their last
# particle so that the case column and the argument column line up.
for verb, arguments in nlp_46(sect).items():
    ordered = sorted(arguments, key=_last_particle)
    case_column = " ".join(_last_particle(arg) for arg in ordered)
    term_column = " ".join(str(arg) for arg in ordered)
    print("{}\t{}\t{}".format(verb, case_column, term_column))

見る	は を	吾輩は ものを
始める	で	ここで

47

c = CaboCha.Parser()
text = "別段くるにも及ばんさと、主人は手紙に返事をする。"
cabocha_result = c.parse(text).toString(CaboCha.FORMAT_LATTICE)
sect = Sect(cabocha_result)


def is_sahen_noun(morph):
    """Return True when ``morph`` is a noun whose pos1 is 'サ変接続'."""
    return morph.pos == '名詞' and morph.pos1 == 'サ変接続'


def is_wo(morph):
    """Return True when ``morph`` is the particle 'を'."""
    return morph.base == 'を' and morph.pos == '助詞'


def is_target(morphs, i=0):
    """Return True when ``morphs[i]``, ``morphs[i+1]`` form "sahen noun + を".

    The position is an explicit argument (it used to be read from a global
    loop variable) and an out-of-range position simply yields False.
    """
    return (
        0 <= i < len(morphs) - 1
        and is_sahen_noun(morphs[i])
        and is_wo(morphs[i + 1])
    )


def find_verb(chunk):
    """Return the leftmost verb morpheme of ``chunk``, or None if it has none."""
    return next((m for m in chunk.morphs if m.pos == "動詞"), None)


def _last_particle(chunk):
    """Return the base form of the last particle in ``chunk``, or None."""
    particles = [m.base for m in chunk.morphs if m.pos == "助詞"]
    return particles[-1] if particles else None


for chunk in sect.chunks:
    # A root chunk (dst == -1) depends on nothing; indexing with -1 would
    # silently pick the last chunk of the sentence instead.
    if chunk.dst == -1:
        continue
    verb = find_verb(sect.chunks[chunk.dst])
    if verb is None:
        continue
    morphs = chunk.morphs
    for i in range(len(morphs) - 1):
        if not is_target(morphs, i):
            continue
        print(chunk, verb, sep="", end="\t")
        # Other chunks depending on the same predicate; chunks without any
        # particle carry no case and are left out.
        values = [
            other for other in sect.chunks
            if other.dst == chunk.dst
            and other != chunk
            and _last_particle(other) is not None
        ]
        values = sorted(values, key=_last_particle)
        print("{} {}".format(
            " ".join(_last_particle(value) for value in values),
            " ".join(map(str, values)),
        ))

返事をする	と に は 及ばんさと、 手紙に 主人は

48

# For every chunk containing a noun, print the chain of chunks obtained by
# following `dst` links until a negative dst is reached, joined with " -> ".
for start in sect.chunks:
    if not any(m.pos == "名詞" for m in start.morphs):
        continue
    path = [str(start)]
    next_index = start.dst
    while next_index > -1:
        path.append(str(sect.chunks[next_index]))
        next_index = sect.chunks[next_index].dst
    print(" -> ".join(path))

吾輩は -> 見た
ここで -> 始めて -> 人間という -> ものを -> 見た
人間という -> ものを -> 見た
ものを -> 見た

49

def noun2root(sect):
    """Yield, for each chunk containing a noun, its path of chunks to the root.

    Each yielded list starts with the noun chunk itself and continues with
    the chunks reached by repeatedly following ``dst`` until a negative
    ``dst`` is met. Paths are yielded in the order of ``sect.chunks``.
    """
    chunks = sect.chunks
    for start in chunks:
        if not any(m.pos == "名詞" for m in start.morphs):
            continue
        path = [start]
        next_index = start.dst
        while next_index > -1:
            node = chunks[next_index]
            path.append(node)
            next_index = node.dst
        yield path

# All noun-chunk-to-root paths of the current sentence, one list of chunks
# per noun chunk (see noun2root).
routes = [e for e in noun2root(sect)]

from copy import deepcopy as dc

# Concatenate the string forms of several chunks (not used further in this snippet).
to_str = lambda chunks: "".join(map(str, chunks))
# Rebuild a chunk's surface text with every noun morpheme replaced by `string`.
# NOTE(review): each noun morpheme is replaced individually, so a chunk with
# several noun morphemes yields the placeholder several times.
f = lambda chunk, string: "".join([m.surface if m.pos !="名詞" else string for m in chunk.morphs])
# Elements that occur more than once in list_, ordered by first occurrence.
dupl = lambda list_: sorted([x for x in set(list_) if list_.count(x) > 1], key=list_.index)
# Keep the chunks whose string appears in `strs` before `d`; the one whose
# string is at index 0 of `strs` is rendered through f with the placeholder
# `face`, the others through str().
filt = lambda chunks, strs, d, face: [str(e) if strs.index(str(e)) != 0 else f(e, face) for e in chunks if strs.index(str(e)) < strs.index(d)]

# Visit every pair of paths (n, routes[i+j]) where the second path comes
# later in `routes` than the first. Chunks are matched by their string form.
for i, n in enumerate(routes, start=1):
  for j in range(len(routes) - i):
    # Work on deep copies so the entries of `routes` are never modified.
    X, Y = dc(n), dc(routes[i+j])
    # Case 1: the head chunk of path j lies on the path from chunk i to the root.
    if str(Y[0]) in map(str, X):
      X[0] = f(X[0], "X")
      # Print path i up to, but not including, that chunk, then a bare "Y".
      for e in X:
        if str(e) == str(Y[0]):
          print("Y")
          break
        print(e, end=" -> ")
    # Case 2: the two paths only meet further up the tree. Print the part of
    # each path before the meeting chunk and then the meeting chunk itself,
    # separated by "|".
    else:
      xl, yl = list(map(str, X)), list(map(str, Y))
      # First chunk string that occurs more than once in xl + yl.
      # NOTE(review): [0] raises IndexError if the two paths share no chunk
      # string; presumably both paths end at the same root - confirm.
      d = dupl(xl + yl)[0]
      print(" -> ".join(filt(X, xl, d, "X")), "|", " -> ".join(filt(Y, yl, d, "Y")), "|", d)

Xは | Yで -> 始めて -> 人間という -> ものを | 見た
Xは | Yという -> ものを | 見た
Xは | Yを | 見た
Xで -> 始めて -> Y
Xで -> 始めて -> 人間という -> Y
Xという -> Y

以下古いやつ

時間が空いてやり直したのがいくつかあるのでアーカイブ

functions

read_cabocha をv3に更新

import re
class Morph:
    r"""One morpheme parsed from a single analysis line of CaboCha output.

    The line is expected to be ``surface<TAB>feature1,feature2,...`` where
    feature 1 is the part of speech, feature 2 its first sub-category and
    feature 7 the base form; two further trailing features are optional.

    Args:
        line (str): e.g. '名前\t名詞,一般,*,*,*,*,名前,ナマエ,ナマエ'

    Attributes:
        surface (str): surface form
        base (str): base form
        pos (str): part of speech
        pos1 (str): part-of-speech sub-category 1

    Raises:
        ValueError: if ``line`` does not have the expected layout.
            ``ValueError`` is an ``Exception`` subclass, so handlers written
            for the previously raised generic ``Exception`` keep working.
    """

    # Compiled once for all instances. The surface is everything before the
    # first tab, so a surface that is itself an ASCII comma is accepted.
    _LINE_PATTERN = re.compile(
        r"^([^\t]*)\t([^,]*?),([^,]*?)(?:,[^,]*?){4},([^,]*?)(?:(?:,[^,]*?){2})?$"
    )

    def __init__(self, line):
        matched = self._LINE_PATTERN.match(line)
        if matched is None:
            raise ValueError(f"Invalid line pattern: \n\t{repr(line)}")
        self.surface, self.pos, self.pos1, self.base = matched.groups()

    def __str__(self):
        return self.surface

class Chunk:
    """A chunk (bunsetsu) of a dependency-parsed sentence.

    Args:
        dst (int): index of the chunk this chunk depends on.

    Attributes:
        morphs (list): morphemes of the chunk, in order.
        srcs (list): indices of the chunks that depend on this chunk.
        dst (int): index of the chunk this chunk depends on.
    """

    def __init__(self, dst):
        # Two separate lists: a chained assignment (a = b = []) would make
        # morphs and srcs the same object, so appending to one would also
        # change the other.
        self.morphs = []
        self.srcs = []
        self.dst = dst

    def __str__(self):
        """Return the concatenated string forms of the chunk's morphemes."""
        return "".join(map(str, self.morphs))

def read_cabocha_v3(text):
    """Parse CaboCha lattice-format text into one list of Chunk per sentence.

    The text is split on "EOS" lines. Within a sentence every line matching
    the chunk-header pattern ("* <index> <dst>D <n>/<m> <score>") starts a
    new chunk, and every other line is parsed as a Morph and appended to the
    chunk opened by the most recent header. Sentences without any chunk
    header are skipped.

    Args:
        text (str): CaboCha output for one or more sentences.

    Yields:
        list: the Chunk objects of one sentence, each with ``dst``, ``srcs``
        and ``morphs`` filled in.

    Raises:
        ValueError: if a morpheme line appears before the first chunk header
            of its sentence.
        Exception: whatever ``Morph`` raises for a malformed morpheme line.
    """
    header = re.compile(r"\*\ (\d+)\ (-1|\d+)D\ \d+\/\d+\ -?\d+\.\d+")
    for sentence in text.split("EOS\n"):
        dep = header.findall(sentence)
        if not dep:
            continue

        chunks = []
        for index, dst in dep:
            chunk = Chunk(int(dst))
            # Always bind a fresh list here rather than appending to the
            # attribute created by the constructor.
            chunk.srcs = [int(src) for src, target in dep if target == index]
            chunks.append(chunk)

        current = -1
        for line in sentence.splitlines():
            if header.match(line):
                current += 1
                continue
            # A final "EOS" without a trailing newline is not removed by the
            # split above, and blank lines carry no morpheme: skip both
            # instead of handing them to Morph.
            if not line.strip() or line == "EOS":
                continue
            if current < 0:
                # Index -1 would silently file the morpheme under the last chunk.
                raise ValueError(
                    f"Morpheme line before any chunk header: {line!r}"
                )
            chunks[current].morphs.append(Morph(line))
        yield chunks

# Load the CaboCha analysis of the corpus. The encoding is given explicitly
# so that the Japanese text is decoded the same way on every platform.
# NOTE(review): assumes neko.txt.cabocha was written as UTF-8 - confirm.
with open("neko.txt.cabocha", encoding="utf-8") as f:
    text = f.read()

# One list of Chunk per sentence.
sentences = list(read_cabocha_v3(text))

45. 動詞の格パターンの抽出

今回用いている文章をコーパスと見なし，日本語の述語が取りうる格を調査したい．動詞を述語，動詞に係っている文節の助詞を格と考え，述語と格をタブ区切り形式で出力せよ．ただし，出力は以下の仕様を満たすようにせよ．

- 動詞を含む文節において，最左の動詞の基本形を述語とする
- 述語に係る助詞を格とする
- 述語に係る助詞（文節）が複数あるときは，すべての助詞をスペース区切りで辞書順に並べる

「吾輩はここで始めて人間というものを見た」という例文（neko.txt.cabocha の 8 文目）を考える．この文は「始める」と「見る」の２つの動詞を含み，「始める」に係る文節は「ここで」，「見る」に係る文節は「吾輩は」と「ものを」と解析された場合は，次のような出力になるはずである．

始める  で
見る    は を

このプログラムの出力をファイルに保存し，以下の事項を UNIX コマンドを用いて確認せよ．

- コーパス中で頻出する述語と格パターンの組み合わせ
- 「する」「見る」「与える」という動詞の格パターン（コーパス中で出現頻度の高い順に並べよ）

def _bases_with_pos(morphs, pos):
    """Return the base forms of the morphemes in ``morphs`` whose pos is ``pos``."""
    return [m.base for m in morphs if m.pos == pos]


# Build one "verb<TAB>particles" line for every chunk that contains a verb
# and has at least one particle among the chunks depending on it.
corpus_lines = []
for sentence in sentences:
    verbs_and_pptls = [
        (_bases_with_pos(chunk.morphs, "動詞"), _bases_with_pos(chunk.morphs, "助詞"))
        for chunk in sentence
    ]
    for chunk, (verbs, _) in zip(sentence, verbs_and_pptls):
        if not verbs:
            continue
        # Particles of all source chunks, sorted and space-separated.
        pptls = " ".join(sorted(
            particle
            for src in chunk.srcs
            for particle in verbs_and_pptls[src][1]
        ))
        if not pptls:
            continue
        # The leftmost verb of the chunk is the predicate.
        corpus_lines.append(f"{verbs[0]}\t{pptls}\n")

# Join once at the end instead of growing a string inside the loop.
text = "".join(corpus_lines)

# Explicit encoding so the Japanese output does not depend on the locale.
with open("neko.txt.corpus", "w", encoding="utf-8") as f:
    f.write(text)

$ head -n 10 neko.txt.corpus
生れる	で
つく	が と
泣く	で
する	だけ て は
始める	で
見る	は を
聞く	で
捕える	を
煮る	て
食う	て

$ cat neko.txt.corpus | sort -t "\t" -k 1 | uniq -c | sort -n -r | head -n 20
 560 云う	と
 436 する	を
 247 思う	と
 199 ある	が
 184 なる	に
 174 する	に
 172 見る	て
 120 する	と
 119 する	が
 108 する	に を
  93 見る	を
  90 見える	と
  79 する	て を
  60 する	が を
  58 もつ	を
  57 する	は
  57 する	て
  55 云う	を
  53 ある	の
  50 出来る	が

$ cat neko.txt.corpus | grep "^する\t" | sort -t "\t" -k 1 | uniq -c | sort -n -r | head -n 5
 436 する	を
 174 する	に
 120 する	と
 119 する	が
 108 する	に を

$ cat neko.txt.corpus | grep "^見る\t" | sort -t "\t" -k 1 | uniq -c | sort -n -r | head -n 5
 172 見る	て
  93 見る	を
  19 見る	て を
  19 見る	て て
  19 見る	から

$ cat neko.txt.corpus | grep "^与える\t" | sort -t "\t" -k 1 | uniq -c | sort -n -r | head -n 5
   3 与える	に を
   2 与える	て に は を
   1 与える	ば を
   1 与える	に に対して のみ は は も
   1 与える	て も を

46. 動詞の格フレーム情報の抽出

45 のプログラムを改変し，述語と格パターンに続けて項（述語に係っている文節そのもの）をタブ区切り形式で出力せよ．45 の仕様に加えて，以下の仕様を満たすようにせよ．

- 項は述語に係っている文節の単語列とする（末尾の助詞を取り除く必要はない）
- 述語に係る文節が複数あるときは，助詞と同一の基準・順序でスペース区切りで並べる

「吾輩はここで始めて人間というものを見た」という例文（neko.txt.cabocha の 8 文目）を考える．この文は「始める」と「見る」の２つの動詞を含み，「始める」に係る文節は「ここで」，「見る」に係る文節は「吾輩は」と「ものを」と解析された場合は，次のような出力になるはずである．

始める  で      ここで
見る    は を   吾輩は ものを

def nlp100_46(sentence):
    """Return the case-frame lines of one sentence as a single string.

    For every chunk that contains a verb, the chunks listed in its ``srcs``
    that contain at least one particle are taken as arguments and ordered by
    their first particle. Each such verb chunk contributes one line:
    ``verb<TAB>particles<TAB>arguments<NEWLINE>``, where the verb is the base
    form of the chunk's leftmost verb and both lists are space-separated.
    Verb chunks without any such argument contribute nothing.

    Args:
        sentence: sequence of chunks; ``srcs`` values index into it.

    Returns:
        str: the concatenated lines ("" when there are none).
    """
    def bases(chunk, pos):
        return [m.base for m in chunk.morphs if m.pos == pos]

    rows = []
    for chunk in sentence:
        verbs = bases(chunk, "動詞")
        if not verbs:
            continue

        arguments = []
        for src in chunk.srcs:
            particles = bases(sentence[src], "助詞")
            if particles:
                arguments.append((particles[0], sentence[src]))
        if not arguments:
            continue

        # Stable sort on the first particle keeps cases and arguments aligned.
        arguments.sort(key=lambda pair: pair[0])
        case_column = " ".join(case for case, _ in arguments)
        term_column = " ".join(str(term) for _, term in arguments)
        rows.append(verbs[0] + "\t" + case_column + "\t" + term_column + "\n")
    return "".join(rows)


# Show the case-frame lines for the sentence at index 5 of `sentences`.
print(nlp100_46(sentences[5]))

memo

サ変接続はpos1とpos2で出てくるが、今回後者は対象外

$ cat neko.txt.cabocha | grep "サ変接続" | cut -f 2 | cut -d "," -f 1-3 | sort -r | uniq -c
4842 名詞,サ変接続,*
  21 名詞,接尾,サ変接続